krishnakalyan3 commited on
Commit
2d5cb38
1 Parent(s): aa5c8aa

2b445798853676171dbb3172edd48d582d7479694914ad7d7ff176bf0ed5fc43

Browse files
Files changed (6) hide show
  1. model.safetensors +3 -0
  2. optimizer.pt +3 -0
  3. rng_state.pth +3 -0
  4. scheduler.pt +3 -0
  5. trainer_state.json +1585 -0
  6. training_args.bin +3 -0
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fc3a645a790262e5fff8923eb30cb75bc640a05333b8348b23ff05940be04b1
3
+ size 379087640
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2172b035c05b1f601b9237d2e6b3321bf4c57e3204baea57989e7d077bc8106e
3
+ size 3152480
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8c88c483589e25c71a3212244d6755ea974f610775bad36d935b3df14dc617d
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d8c82186737c1208be5544e2a6dfb5eaf7f4a9ade52220334c78a811e5286d6
3
+ size 1000
trainer_state.json ADDED
@@ -0,0 +1,1585 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.12010584022747946,
3
+ "best_model_checkpoint": "/workspace/disk2/krishna/checkpoints/checkpoint-940",
4
+ "epoch": 0.097,
5
+ "eval_steps": 10,
6
+ "global_step": 970,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.001,
13
+ "grad_norm": 0.0356324203312397,
14
+ "learning_rate": 1e-05,
15
+ "loss": 0.1207,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.001,
20
+ "eval_cos_sim": 0.8792359232902527,
21
+ "eval_loss": 0.12173220255124045,
22
+ "eval_runtime": 171.7728,
23
+ "eval_samples_per_second": 23.287,
24
+ "eval_steps_per_second": 0.367,
25
+ "step": 10
26
+ },
27
+ {
28
+ "epoch": 0.002,
29
+ "grad_norm": 0.030419372022151947,
30
+ "learning_rate": 2e-05,
31
+ "loss": 0.1213,
32
+ "step": 20
33
+ },
34
+ {
35
+ "epoch": 0.002,
36
+ "eval_cos_sim": 0.8789854049682617,
37
+ "eval_loss": 0.12198310520398092,
38
+ "eval_runtime": 159.1521,
39
+ "eval_samples_per_second": 25.133,
40
+ "eval_steps_per_second": 0.396,
41
+ "step": 20
42
+ },
43
+ {
44
+ "epoch": 0.003,
45
+ "grad_norm": 0.033041320741176605,
46
+ "learning_rate": 3e-05,
47
+ "loss": 0.1204,
48
+ "step": 30
49
+ },
50
+ {
51
+ "epoch": 0.003,
52
+ "eval_cos_sim": 0.8790653347969055,
53
+ "eval_loss": 0.12189955379712057,
54
+ "eval_runtime": 161.0741,
55
+ "eval_samples_per_second": 24.833,
56
+ "eval_steps_per_second": 0.391,
57
+ "step": 30
58
+ },
59
+ {
60
+ "epoch": 0.004,
61
+ "grad_norm": 0.04209210351109505,
62
+ "learning_rate": 4e-05,
63
+ "loss": 0.1213,
64
+ "step": 40
65
+ },
66
+ {
67
+ "epoch": 0.004,
68
+ "eval_cos_sim": 0.8792155385017395,
69
+ "eval_loss": 0.12175503399121237,
70
+ "eval_runtime": 159.4228,
71
+ "eval_samples_per_second": 25.091,
72
+ "eval_steps_per_second": 0.395,
73
+ "step": 40
74
+ },
75
+ {
76
+ "epoch": 0.005,
77
+ "grad_norm": 0.03182140365242958,
78
+ "learning_rate": 5e-05,
79
+ "loss": 0.1203,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.005,
84
+ "eval_cos_sim": 0.8791962265968323,
85
+ "eval_loss": 0.12176946785199118,
86
+ "eval_runtime": 160.3207,
87
+ "eval_samples_per_second": 24.95,
88
+ "eval_steps_per_second": 0.393,
89
+ "step": 50
90
+ },
91
+ {
92
+ "epoch": 0.006,
93
+ "grad_norm": 0.05823719501495361,
94
+ "learning_rate": 2.4802665827257164e-05,
95
+ "loss": 0.1213,
96
+ "step": 60
97
+ },
98
+ {
99
+ "epoch": 0.006,
100
+ "eval_cos_sim": 0.8791635036468506,
101
+ "eval_loss": 0.12179789688336325,
102
+ "eval_runtime": 164.7575,
103
+ "eval_samples_per_second": 24.278,
104
+ "eval_steps_per_second": 0.382,
105
+ "step": 60
106
+ },
107
+ {
108
+ "epoch": 0.007,
109
+ "grad_norm": 0.02305755950510502,
110
+ "learning_rate": 4.999688473794144e-05,
111
+ "loss": 0.1211,
112
+ "step": 70
113
+ },
114
+ {
115
+ "epoch": 0.007,
116
+ "eval_cos_sim": 0.8792662024497986,
117
+ "eval_loss": 0.12169700815426779,
118
+ "eval_runtime": 159.8158,
119
+ "eval_samples_per_second": 25.029,
120
+ "eval_steps_per_second": 0.394,
121
+ "step": 70
122
+ },
123
+ {
124
+ "epoch": 0.008,
125
+ "grad_norm": 0.03906348720192909,
126
+ "learning_rate": 2.4408046661584414e-05,
127
+ "loss": 0.1201,
128
+ "step": 80
129
+ },
130
+ {
131
+ "epoch": 0.008,
132
+ "eval_cos_sim": 0.879157543182373,
133
+ "eval_loss": 0.12181253530728293,
134
+ "eval_runtime": 162.3324,
135
+ "eval_samples_per_second": 24.641,
136
+ "eval_steps_per_second": 0.388,
137
+ "step": 80
138
+ },
139
+ {
140
+ "epoch": 0.009,
141
+ "grad_norm": 0.0275803804397583,
142
+ "learning_rate": 4.998753972815434e-05,
143
+ "loss": 0.1208,
144
+ "step": 90
145
+ },
146
+ {
147
+ "epoch": 0.009,
148
+ "eval_cos_sim": 0.8792516589164734,
149
+ "eval_loss": 0.121718086264009,
150
+ "eval_runtime": 158.5035,
151
+ "eval_samples_per_second": 25.236,
152
+ "eval_steps_per_second": 0.397,
153
+ "step": 90
154
+ },
155
+ {
156
+ "epoch": 0.01,
157
+ "grad_norm": 0.03248042240738869,
158
+ "learning_rate": 2.4013575023093667e-05,
159
+ "loss": 0.1224,
160
+ "step": 100
161
+ },
162
+ {
163
+ "epoch": 0.01,
164
+ "eval_cos_sim": 0.879462718963623,
165
+ "eval_loss": 0.1215014844154067,
166
+ "eval_runtime": 159.5892,
167
+ "eval_samples_per_second": 25.064,
168
+ "eval_steps_per_second": 0.395,
169
+ "step": 100
170
+ },
171
+ {
172
+ "epoch": 0.011,
173
+ "grad_norm": 0.03436814621090889,
174
+ "learning_rate": 4.9971967299611097e-05,
175
+ "loss": 0.1205,
176
+ "step": 110
177
+ },
178
+ {
179
+ "epoch": 0.011,
180
+ "eval_cos_sim": 0.8795427680015564,
181
+ "eval_loss": 0.1214234667037673,
182
+ "eval_runtime": 168.2892,
183
+ "eval_samples_per_second": 23.769,
184
+ "eval_steps_per_second": 0.374,
185
+ "step": 110
186
+ },
187
+ {
188
+ "epoch": 0.012,
189
+ "grad_norm": 0.03663235530257225,
190
+ "learning_rate": 2.3619349222387287e-05,
191
+ "loss": 0.1209,
192
+ "step": 120
193
+ },
194
+ {
195
+ "epoch": 0.012,
196
+ "eval_cos_sim": 0.8793898224830627,
197
+ "eval_loss": 0.1215821056579299,
198
+ "eval_runtime": 170.7269,
199
+ "eval_samples_per_second": 23.429,
200
+ "eval_steps_per_second": 0.369,
201
+ "step": 120
202
+ },
203
+ {
204
+ "epoch": 0.013,
205
+ "grad_norm": 0.03549114614725113,
206
+ "learning_rate": 4.9950171333287335e-05,
207
+ "loss": 0.1218,
208
+ "step": 130
209
+ },
210
+ {
211
+ "epoch": 0.013,
212
+ "eval_cos_sim": 0.8795300722122192,
213
+ "eval_loss": 0.12144066002118063,
214
+ "eval_runtime": 162.7257,
215
+ "eval_samples_per_second": 24.581,
216
+ "eval_steps_per_second": 0.387,
217
+ "step": 130
218
+ },
219
+ {
220
+ "epoch": 0.014,
221
+ "grad_norm": 0.03164505586028099,
222
+ "learning_rate": 2.3225467508799633e-05,
223
+ "loss": 0.1208,
224
+ "step": 140
225
+ },
226
+ {
227
+ "epoch": 0.014,
228
+ "eval_cos_sim": 0.8797659873962402,
229
+ "eval_loss": 0.12119961311566306,
230
+ "eval_runtime": 163.5115,
231
+ "eval_samples_per_second": 24.463,
232
+ "eval_steps_per_second": 0.385,
233
+ "step": 140
234
+ },
235
+ {
236
+ "epoch": 0.015,
237
+ "grad_norm": 0.031108180060982704,
238
+ "learning_rate": 4.992215726119483e-05,
239
+ "loss": 0.1213,
240
+ "step": 150
241
+ },
242
+ {
243
+ "epoch": 0.015,
244
+ "eval_cos_sim": 0.8797353506088257,
245
+ "eval_loss": 0.1212306742881484,
246
+ "eval_runtime": 165.866,
247
+ "eval_samples_per_second": 24.116,
248
+ "eval_steps_per_second": 0.38,
249
+ "step": 150
250
+ },
251
+ {
252
+ "epoch": 0.016,
253
+ "grad_norm": 0.030103642493486404,
254
+ "learning_rate": 2.2832028045911203e-05,
255
+ "loss": 0.1209,
256
+ "step": 160
257
+ },
258
+ {
259
+ "epoch": 0.016,
260
+ "eval_cos_sim": 0.8793777823448181,
261
+ "eval_loss": 0.12159336998211813,
262
+ "eval_runtime": 171.7298,
263
+ "eval_samples_per_second": 23.292,
264
+ "eval_steps_per_second": 0.367,
265
+ "step": 160
266
+ },
267
+ {
268
+ "epoch": 0.017,
269
+ "grad_norm": 0.05055614188313484,
270
+ "learning_rate": 4.9887932065027656e-05,
271
+ "loss": 0.1204,
272
+ "step": 170
273
+ },
274
+ {
275
+ "epoch": 0.017,
276
+ "eval_cos_sim": 0.8795183300971985,
277
+ "eval_loss": 0.1214520344947524,
278
+ "eval_runtime": 162.2461,
279
+ "eval_samples_per_second": 24.654,
280
+ "eval_steps_per_second": 0.388,
281
+ "step": 170
282
+ },
283
+ {
284
+ "epoch": 0.018,
285
+ "grad_norm": 0.03837039694190025,
286
+ "learning_rate": 2.2439128887084646e-05,
287
+ "loss": 0.1202,
288
+ "step": 180
289
+ },
290
+ {
291
+ "epoch": 0.018,
292
+ "eval_cos_sim": 0.8797397017478943,
293
+ "eval_loss": 0.12122445728527975,
294
+ "eval_runtime": 161.451,
295
+ "eval_samples_per_second": 24.775,
296
+ "eval_steps_per_second": 0.39,
297
+ "step": 180
298
+ },
299
+ {
300
+ "epoch": 0.019,
301
+ "grad_norm": 0.03563898801803589,
302
+ "learning_rate": 4.98475042744222e-05,
303
+ "loss": 0.1221,
304
+ "step": 190
305
+ },
306
+ {
307
+ "epoch": 0.019,
308
+ "eval_cos_sim": 0.8797785639762878,
309
+ "eval_loss": 0.12118578769909812,
310
+ "eval_runtime": 157.9064,
311
+ "eval_samples_per_second": 25.331,
312
+ "eval_steps_per_second": 0.399,
313
+ "step": 190
314
+ },
315
+ {
316
+ "epoch": 0.02,
317
+ "grad_norm": 0.0392858162522316,
318
+ "learning_rate": 2.204686795102736e-05,
319
+ "loss": 0.1204,
320
+ "step": 200
321
+ },
322
+ {
323
+ "epoch": 0.02,
324
+ "eval_cos_sim": 0.8796395063400269,
325
+ "eval_loss": 0.12133027555691672,
326
+ "eval_runtime": 163.7884,
327
+ "eval_samples_per_second": 24.422,
328
+ "eval_steps_per_second": 0.385,
329
+ "step": 200
330
+ },
331
+ {
332
+ "epoch": 0.021,
333
+ "grad_norm": 0.04556349664926529,
334
+ "learning_rate": 4.980088396483144e-05,
335
+ "loss": 0.1205,
336
+ "step": 210
337
+ },
338
+ {
339
+ "epoch": 0.021,
340
+ "eval_cos_sim": 0.8796445727348328,
341
+ "eval_loss": 0.12132433941113424,
342
+ "eval_runtime": 164.6589,
343
+ "eval_samples_per_second": 24.293,
344
+ "eval_steps_per_second": 0.383,
345
+ "step": 210
346
+ },
347
+ {
348
+ "epoch": 0.022,
349
+ "grad_norm": 0.030130930244922638,
350
+ "learning_rate": 2.1655342997387947e-05,
351
+ "loss": 0.1201,
352
+ "step": 220
353
+ },
354
+ {
355
+ "epoch": 0.022,
356
+ "eval_cos_sim": 0.8796879649162292,
357
+ "eval_loss": 0.12127337600933981,
358
+ "eval_runtime": 163.052,
359
+ "eval_samples_per_second": 24.532,
360
+ "eval_steps_per_second": 0.386,
361
+ "step": 220
362
+ },
363
+ {
364
+ "epoch": 0.023,
365
+ "grad_norm": 0.027453621849417686,
366
+ "learning_rate": 4.9748082755013934e-05,
367
+ "loss": 0.1205,
368
+ "step": 230
369
+ },
370
+ {
371
+ "epoch": 0.023,
372
+ "eval_cos_sim": 0.8797481060028076,
373
+ "eval_loss": 0.12121248771893454,
374
+ "eval_runtime": 159.6988,
375
+ "eval_samples_per_second": 25.047,
376
+ "eval_steps_per_second": 0.394,
377
+ "step": 230
378
+ },
379
+ {
380
+ "epoch": 0.024,
381
+ "grad_norm": 0.029768602922558784,
382
+ "learning_rate": 2.126465160239341e-05,
383
+ "loss": 0.1206,
384
+ "step": 240
385
+ },
386
+ {
387
+ "epoch": 0.024,
388
+ "eval_cos_sim": 0.8797679543495178,
389
+ "eval_loss": 0.12119435026394797,
390
+ "eval_runtime": 170.5172,
391
+ "eval_samples_per_second": 23.458,
392
+ "eval_steps_per_second": 0.369,
393
+ "step": 240
394
+ },
395
+ {
396
+ "epoch": 0.025,
397
+ "grad_norm": 0.025975426658988,
398
+ "learning_rate": 4.968911380413809e-05,
399
+ "loss": 0.1206,
400
+ "step": 250
401
+ },
402
+ {
403
+ "epoch": 0.025,
404
+ "eval_cos_sim": 0.8798050284385681,
405
+ "eval_loss": 0.12115855839001609,
406
+ "eval_runtime": 162.5635,
407
+ "eval_samples_per_second": 24.606,
408
+ "eval_steps_per_second": 0.388,
409
+ "step": 250
410
+ },
411
+ {
412
+ "epoch": 0.026,
413
+ "grad_norm": 0.032136961817741394,
414
+ "learning_rate": 2.0874891134530094e-05,
415
+ "loss": 0.1207,
416
+ "step": 260
417
+ },
418
+ {
419
+ "epoch": 0.026,
420
+ "eval_cos_sim": 0.8799233436584473,
421
+ "eval_loss": 0.12104250910031271,
422
+ "eval_runtime": 171.0657,
423
+ "eval_samples_per_second": 23.383,
424
+ "eval_steps_per_second": 0.368,
425
+ "step": 260
426
+ },
427
+ {
428
+ "epoch": 0.027,
429
+ "grad_norm": 0.035989198833703995,
430
+ "learning_rate": 4.962399180850275e-05,
431
+ "loss": 0.12,
432
+ "step": 270
433
+ },
434
+ {
435
+ "epoch": 0.027,
436
+ "eval_cos_sim": 0.8800029754638672,
437
+ "eval_loss": 0.12096367742764426,
438
+ "eval_runtime": 162.5704,
439
+ "eval_samples_per_second": 24.605,
440
+ "eval_steps_per_second": 0.388,
441
+ "step": 270
442
+ },
443
+ {
444
+ "epoch": 0.028,
445
+ "grad_norm": 0.02917526848614216,
446
+ "learning_rate": 2.0486158730277393e-05,
447
+ "loss": 0.1205,
448
+ "step": 280
449
+ },
450
+ {
451
+ "epoch": 0.028,
452
+ "eval_cos_sim": 0.8800209164619446,
453
+ "eval_loss": 0.12094438698040914,
454
+ "eval_runtime": 163.135,
455
+ "eval_samples_per_second": 24.52,
456
+ "eval_steps_per_second": 0.386,
457
+ "step": 280
458
+ },
459
+ {
460
+ "epoch": 0.029,
461
+ "grad_norm": 0.040587518364191055,
462
+ "learning_rate": 4.955273299787453e-05,
463
+ "loss": 0.1204,
464
+ "step": 290
465
+ },
466
+ {
467
+ "epoch": 0.029,
468
+ "eval_cos_sim": 0.8800665140151978,
469
+ "eval_loss": 0.12090009071576072,
470
+ "eval_runtime": 160.8422,
471
+ "eval_samples_per_second": 24.869,
472
+ "eval_steps_per_second": 0.392,
473
+ "step": 290
474
+ },
475
+ {
476
+ "epoch": 0.03,
477
+ "grad_norm": 0.02535935305058956,
478
+ "learning_rate": 2.00985512699005e-05,
479
+ "loss": 0.121,
480
+ "step": 300
481
+ },
482
+ {
483
+ "epoch": 0.03,
484
+ "eval_cos_sim": 0.8799148201942444,
485
+ "eval_loss": 0.12105432750927878,
486
+ "eval_runtime": 162.6443,
487
+ "eval_samples_per_second": 24.594,
488
+ "eval_steps_per_second": 0.387,
489
+ "step": 300
490
+ },
491
+ {
492
+ "epoch": 0.031,
493
+ "grad_norm": 0.027923179790377617,
494
+ "learning_rate": 4.947535513144286e-05,
495
+ "loss": 0.1197,
496
+ "step": 310
497
+ },
498
+ {
499
+ "epoch": 0.031,
500
+ "eval_cos_sim": 0.8799843788146973,
501
+ "eval_loss": 0.120985016367311,
502
+ "eval_runtime": 165.6185,
503
+ "eval_samples_per_second": 24.152,
504
+ "eval_steps_per_second": 0.38,
505
+ "step": 310
506
+ },
507
+ {
508
+ "epoch": 0.032,
509
+ "grad_norm": 0.025140805169939995,
510
+ "learning_rate": 1.9712165353304617e-05,
511
+ "loss": 0.1199,
512
+ "step": 320
513
+ },
514
+ {
515
+ "epoch": 0.032,
516
+ "eval_cos_sim": 0.8800230622291565,
517
+ "eval_loss": 0.1209463920806594,
518
+ "eval_runtime": 161.2564,
519
+ "eval_samples_per_second": 24.805,
520
+ "eval_steps_per_second": 0.391,
521
+ "step": 320
522
+ },
523
+ {
524
+ "epoch": 0.033,
525
+ "grad_norm": 0.03448393940925598,
526
+ "learning_rate": 4.9391877493394335e-05,
527
+ "loss": 0.1205,
528
+ "step": 330
529
+ },
530
+ {
531
+ "epoch": 0.033,
532
+ "eval_cos_sim": 0.8800433278083801,
533
+ "eval_loss": 0.12092636968838645,
534
+ "eval_runtime": 165.583,
535
+ "eval_samples_per_second": 24.157,
536
+ "eval_steps_per_second": 0.38,
537
+ "step": 330
538
+ },
539
+ {
540
+ "epoch": 0.034,
541
+ "grad_norm": 0.027445893734693527,
542
+ "learning_rate": 1.9327097275960212e-05,
543
+ "loss": 0.1208,
544
+ "step": 340
545
+ },
546
+ {
547
+ "epoch": 0.034,
548
+ "eval_cos_sim": 0.8797867894172668,
549
+ "eval_loss": 0.12118107464062644,
550
+ "eval_runtime": 169.9936,
551
+ "eval_samples_per_second": 23.53,
552
+ "eval_steps_per_second": 0.371,
553
+ "step": 340
554
+ },
555
+ {
556
+ "epoch": 0.035,
557
+ "grad_norm": 0.032430149614810944,
558
+ "learning_rate": 4.9302320888106454e-05,
559
+ "loss": 0.1192,
560
+ "step": 350
561
+ },
562
+ {
563
+ "epoch": 0.035,
564
+ "eval_cos_sim": 0.8799253106117249,
565
+ "eval_loss": 0.12104199745404196,
566
+ "eval_runtime": 162.0388,
567
+ "eval_samples_per_second": 24.685,
568
+ "eval_steps_per_second": 0.389,
569
+ "step": 350
570
+ },
571
+ {
572
+ "epoch": 0.036,
573
+ "grad_norm": 0.03066575713455677,
574
+ "learning_rate": 1.894344300490539e-05,
575
+ "loss": 0.1207,
576
+ "step": 360
577
+ },
578
+ {
579
+ "epoch": 0.036,
580
+ "eval_cos_sim": 0.8800117373466492,
581
+ "eval_loss": 0.12095709469067527,
582
+ "eval_runtime": 168.2521,
583
+ "eval_samples_per_second": 23.774,
584
+ "eval_steps_per_second": 0.374,
585
+ "step": 360
586
+ },
587
+ {
588
+ "epoch": 0.037,
589
+ "grad_norm": 0.04023744910955429,
590
+ "learning_rate": 4.920670763496264e-05,
591
+ "loss": 0.1206,
592
+ "step": 370
593
+ },
594
+ {
595
+ "epoch": 0.037,
596
+ "eval_cos_sim": 0.8800341486930847,
597
+ "eval_loss": 0.12093350794064474,
598
+ "eval_runtime": 164.2446,
599
+ "eval_samples_per_second": 24.354,
600
+ "eval_steps_per_second": 0.384,
601
+ "step": 370
602
+ },
603
+ {
604
+ "epoch": 0.038,
605
+ "grad_norm": 0.03345053270459175,
606
+ "learning_rate": 1.8561298154827563e-05,
607
+ "loss": 0.1207,
608
+ "step": 380
609
+ },
610
+ {
611
+ "epoch": 0.038,
612
+ "eval_cos_sim": 0.8800336122512817,
613
+ "eval_loss": 0.12093256904828024,
614
+ "eval_runtime": 158.5429,
615
+ "eval_samples_per_second": 25.23,
616
+ "eval_steps_per_second": 0.397,
617
+ "step": 380
618
+ },
619
+ {
620
+ "epoch": 0.039,
621
+ "grad_norm": 0.02383916825056076,
622
+ "learning_rate": 4.910506156279026e-05,
623
+ "loss": 0.1213,
624
+ "step": 390
625
+ },
626
+ {
627
+ "epoch": 0.039,
628
+ "eval_cos_sim": 0.8800181150436401,
629
+ "eval_loss": 0.12094816543805074,
630
+ "eval_runtime": 164.6828,
631
+ "eval_samples_per_second": 24.289,
632
+ "eval_steps_per_second": 0.383,
633
+ "step": 390
634
+ },
635
+ {
636
+ "epoch": 0.04,
637
+ "grad_norm": 0.03217790648341179,
638
+ "learning_rate": 1.8180757964234907e-05,
639
+ "loss": 0.1213,
640
+ "step": 400
641
+ },
642
+ {
643
+ "epoch": 0.04,
644
+ "eval_cos_sim": 0.8800681829452515,
645
+ "eval_loss": 0.12089718677746726,
646
+ "eval_runtime": 162.9873,
647
+ "eval_samples_per_second": 24.542,
648
+ "eval_steps_per_second": 0.387,
649
+ "step": 400
650
+ },
651
+ {
652
+ "epoch": 0.041,
653
+ "grad_norm": 0.03514571115374565,
654
+ "learning_rate": 4.8997408003921466e-05,
655
+ "loss": 0.1208,
656
+ "step": 410
657
+ },
658
+ {
659
+ "epoch": 0.041,
660
+ "eval_cos_sim": 0.8801241517066956,
661
+ "eval_loss": 0.12084018089520407,
662
+ "eval_runtime": 167.298,
663
+ "eval_samples_per_second": 23.909,
664
+ "eval_steps_per_second": 0.377,
665
+ "step": 410
666
+ },
667
+ {
668
+ "epoch": 0.042,
669
+ "grad_norm": 0.03063860908150673,
670
+ "learning_rate": 1.780191727172083e-05,
671
+ "loss": 0.1207,
672
+ "step": 420
673
+ },
674
+ {
675
+ "epoch": 0.042,
676
+ "eval_cos_sim": 0.8799417018890381,
677
+ "eval_loss": 0.12102477314221335,
678
+ "eval_runtime": 165.8735,
679
+ "eval_samples_per_second": 24.115,
680
+ "eval_steps_per_second": 0.38,
681
+ "step": 420
682
+ },
683
+ {
684
+ "epoch": 0.043,
685
+ "grad_norm": 0.0319770872592926,
686
+ "learning_rate": 4.8883773787879826e-05,
687
+ "loss": 0.1205,
688
+ "step": 430
689
+ },
690
+ {
691
+ "epoch": 0.043,
692
+ "eval_cos_sim": 0.8800466060638428,
693
+ "eval_loss": 0.12091960479962302,
694
+ "eval_runtime": 163.3913,
695
+ "eval_samples_per_second": 24.481,
696
+ "eval_steps_per_second": 0.386,
697
+ "step": 430
698
+ },
699
+ {
700
+ "epoch": 0.044,
701
+ "grad_norm": 0.02543482929468155,
702
+ "learning_rate": 1.742487049232818e-05,
703
+ "loss": 0.1202,
704
+ "step": 440
705
+ },
706
+ {
707
+ "epoch": 0.044,
708
+ "eval_cos_sim": 0.8802583813667297,
709
+ "eval_loss": 0.12070597412335349,
710
+ "eval_runtime": 160.473,
711
+ "eval_samples_per_second": 24.926,
712
+ "eval_steps_per_second": 0.393,
713
+ "step": 440
714
+ },
715
+ {
716
+ "epoch": 0.045,
717
+ "grad_norm": 0.024107394739985466,
718
+ "learning_rate": 4.876418723469453e-05,
719
+ "loss": 0.1196,
720
+ "step": 450
721
+ },
722
+ {
723
+ "epoch": 0.045,
724
+ "eval_cos_sim": 0.8802623748779297,
725
+ "eval_loss": 0.12070202877270651,
726
+ "eval_runtime": 168.2567,
727
+ "eval_samples_per_second": 23.773,
728
+ "eval_steps_per_second": 0.374,
729
+ "step": 450
730
+ },
731
+ {
732
+ "epoch": 0.046,
733
+ "grad_norm": 0.04505016654729843,
734
+ "learning_rate": 1.7049711594019046e-05,
735
+ "loss": 0.1197,
736
+ "step": 460
737
+ },
738
+ {
739
+ "epoch": 0.046,
740
+ "eval_cos_sim": 0.8800992965698242,
741
+ "eval_loss": 0.12086664869534446,
742
+ "eval_runtime": 168.3415,
743
+ "eval_samples_per_second": 23.761,
744
+ "eval_steps_per_second": 0.374,
745
+ "step": 460
746
+ },
747
+ {
748
+ "epoch": 0.047,
749
+ "grad_norm": 0.026298915967345238,
750
+ "learning_rate": 4.8638678147841726e-05,
751
+ "loss": 0.1207,
752
+ "step": 470
753
+ },
754
+ {
755
+ "epoch": 0.047,
756
+ "eval_cos_sim": 0.8801872134208679,
757
+ "eval_loss": 0.12077882673489523,
758
+ "eval_runtime": 171.1708,
759
+ "eval_samples_per_second": 23.368,
760
+ "eval_steps_per_second": 0.368,
761
+ "step": 470
762
+ },
763
+ {
764
+ "epoch": 0.048,
765
+ "grad_norm": 0.04072026535868645,
766
+ "learning_rate": 1.667653407425599e-05,
767
+ "loss": 0.12,
768
+ "step": 480
769
+ },
770
+ {
771
+ "epoch": 0.048,
772
+ "eval_cos_sim": 0.8804126977920532,
773
+ "eval_loss": 0.12055267622219992,
774
+ "eval_runtime": 160.8906,
775
+ "eval_samples_per_second": 24.862,
776
+ "eval_steps_per_second": 0.392,
777
+ "step": 480
778
+ },
779
+ {
780
+ "epoch": 0.049,
781
+ "grad_norm": 0.02353891357779503,
782
+ "learning_rate": 4.850727780681685e-05,
783
+ "loss": 0.121,
784
+ "step": 490
785
+ },
786
+ {
787
+ "epoch": 0.049,
788
+ "eval_cos_sim": 0.8802867531776428,
789
+ "eval_loss": 0.12067857982861471,
790
+ "eval_runtime": 162.0814,
791
+ "eval_samples_per_second": 24.679,
792
+ "eval_steps_per_second": 0.389,
793
+ "step": 490
794
+ },
795
+ {
796
+ "epoch": 0.05,
797
+ "grad_norm": 0.03163010999560356,
798
+ "learning_rate": 1.6305430936700462e-05,
799
+ "loss": 0.1206,
800
+ "step": 500
801
+ },
802
+ {
803
+ "epoch": 0.05,
804
+ "eval_cos_sim": 0.8799078464508057,
805
+ "eval_loss": 0.12105564882504416,
806
+ "eval_runtime": 159.7041,
807
+ "eval_samples_per_second": 25.046,
808
+ "eval_steps_per_second": 0.394,
809
+ "step": 500
810
+ },
811
+ {
812
+ "epoch": 0.051,
813
+ "grad_norm": 0.03480914607644081,
814
+ "learning_rate": 4.8370018959339916e-05,
815
+ "loss": 0.1193,
816
+ "step": 510
817
+ },
818
+ {
819
+ "epoch": 0.051,
820
+ "eval_cos_sim": 0.8801398873329163,
821
+ "eval_loss": 0.12082168438183737,
822
+ "eval_runtime": 168.1565,
823
+ "eval_samples_per_second": 23.787,
824
+ "eval_steps_per_second": 0.375,
825
+ "step": 510
826
+ },
827
+ {
828
+ "epoch": 0.052,
829
+ "grad_norm": 0.031566403806209564,
830
+ "learning_rate": 1.5936494668034417e-05,
831
+ "loss": 0.1207,
832
+ "step": 520
833
+ },
834
+ {
835
+ "epoch": 0.052,
836
+ "eval_cos_sim": 0.8804723024368286,
837
+ "eval_loss": 0.12048741771923971,
838
+ "eval_runtime": 162.0779,
839
+ "eval_samples_per_second": 24.679,
840
+ "eval_steps_per_second": 0.389,
841
+ "step": 520
842
+ },
843
+ {
844
+ "epoch": 0.053,
845
+ "grad_norm": 0.02134857140481472,
846
+ "learning_rate": 4.822693581319333e-05,
847
+ "loss": 0.1207,
848
+ "step": 530
849
+ },
850
+ {
851
+ "epoch": 0.053,
852
+ "eval_cos_sim": 0.8804084062576294,
853
+ "eval_loss": 0.12055278637158347,
854
+ "eval_runtime": 165.0027,
855
+ "eval_samples_per_second": 24.242,
856
+ "eval_steps_per_second": 0.382,
857
+ "step": 530
858
+ },
859
+ {
860
+ "epoch": 0.054,
861
+ "grad_norm": 0.02998766116797924,
862
+ "learning_rate": 1.5569817214910634e-05,
863
+ "loss": 0.1206,
864
+ "step": 540
865
+ },
866
+ {
867
+ "epoch": 0.054,
868
+ "eval_cos_sim": 0.879996657371521,
869
+ "eval_loss": 0.12096652509915305,
870
+ "eval_runtime": 269.6523,
871
+ "eval_samples_per_second": 14.834,
872
+ "eval_steps_per_second": 0.234,
873
+ "step": 540
874
+ },
875
+ {
876
+ "epoch": 0.055,
877
+ "grad_norm": 0.023394938558340073,
878
+ "learning_rate": 4.807806402769648e-05,
879
+ "loss": 0.1204,
880
+ "step": 550
881
+ },
882
+ {
883
+ "epoch": 0.055,
884
+ "eval_cos_sim": 0.8802942037582397,
885
+ "eval_loss": 0.12066655971753074,
886
+ "eval_runtime": 241.2043,
887
+ "eval_samples_per_second": 16.583,
888
+ "eval_steps_per_second": 0.261,
889
+ "step": 550
890
+ },
891
+ {
892
+ "epoch": 0.056,
893
+ "grad_norm": 0.04035342484712601,
894
+ "learning_rate": 1.520548996103771e-05,
895
+ "loss": 0.1208,
896
+ "step": 560
897
+ },
898
+ {
899
+ "epoch": 0.056,
900
+ "eval_cos_sim": 0.8805367946624756,
901
+ "eval_loss": 0.12042092802273703,
902
+ "eval_runtime": 217.8859,
903
+ "eval_samples_per_second": 18.358,
904
+ "eval_steps_per_second": 0.289,
905
+ "step": 560
906
+ },
907
+ {
908
+ "epoch": 0.057,
909
+ "grad_norm": 0.02704194188117981,
910
+ "learning_rate": 4.7923440704819685e-05,
911
+ "loss": 0.1205,
912
+ "step": 570
913
+ },
914
+ {
915
+ "epoch": 0.057,
916
+ "eval_cos_sim": 0.8805016875267029,
917
+ "eval_loss": 0.12045616819607688,
918
+ "eval_runtime": 163.5639,
919
+ "eval_samples_per_second": 24.455,
920
+ "eval_steps_per_second": 0.385,
921
+ "step": 570
922
+ },
923
+ {
924
+ "epoch": 0.058,
925
+ "grad_norm": 0.041525471955537796,
926
+ "learning_rate": 1.4843603704405321e-05,
927
+ "loss": 0.1209,
928
+ "step": 580
929
+ },
930
+ {
931
+ "epoch": 0.058,
932
+ "eval_cos_sim": 0.8803950548171997,
933
+ "eval_loss": 0.12056205751645041,
934
+ "eval_runtime": 163.8454,
935
+ "eval_samples_per_second": 24.413,
936
+ "eval_steps_per_second": 0.385,
937
+ "step": 580
938
+ },
939
+ {
940
+ "epoch": 0.059,
941
+ "grad_norm": 0.02588295191526413,
942
+ "learning_rate": 4.7763104379936636e-05,
943
+ "loss": 0.12,
944
+ "step": 590
945
+ },
946
+ {
947
+ "epoch": 0.059,
948
+ "eval_cos_sim": 0.8804982304573059,
949
+ "eval_loss": 0.12045794346081687,
950
+ "eval_runtime": 175.2999,
951
+ "eval_samples_per_second": 22.818,
952
+ "eval_steps_per_second": 0.359,
953
+ "step": 590
954
+ },
955
+ {
956
+ "epoch": 0.06,
957
+ "grad_norm": 0.030644405633211136,
958
+ "learning_rate": 1.4484248634655188e-05,
959
+ "loss": 0.1211,
960
+ "step": 600
961
+ },
962
+ {
963
+ "epoch": 0.06,
964
+ "eval_cos_sim": 0.8804518580436707,
965
+ "eval_loss": 0.12050184681164694,
966
+ "eval_runtime": 165.1748,
967
+ "eval_samples_per_second": 24.217,
968
+ "eval_steps_per_second": 0.381,
969
+ "step": 600
970
+ },
971
+ {
972
+ "epoch": 0.061,
973
+ "grad_norm": 0.03162102401256561,
974
+ "learning_rate": 4.7597095012220556e-05,
975
+ "loss": 0.1194,
976
+ "step": 610
977
+ },
978
+ {
979
+ "epoch": 0.061,
980
+ "eval_cos_sim": 0.8805278539657593,
981
+ "eval_loss": 0.12042546226727438,
982
+ "eval_runtime": 169.0168,
983
+ "eval_samples_per_second": 23.666,
984
+ "eval_steps_per_second": 0.373,
985
+ "step": 610
986
+ },
987
+ {
988
+ "epoch": 0.062,
989
+ "grad_norm": 0.030952898785471916,
990
+ "learning_rate": 1.4127514310605238e-05,
991
+ "loss": 0.1202,
992
+ "step": 620
993
+ },
994
+ {
995
+ "epoch": 0.062,
996
+ "eval_cos_sim": 0.880526602268219,
997
+ "eval_loss": 0.12043014385449362,
998
+ "eval_runtime": 161.73,
999
+ "eval_samples_per_second": 24.733,
1000
+ "eval_steps_per_second": 0.39,
1001
+ "step": 620
1002
+ },
1003
+ {
1004
+ "epoch": 0.063,
1005
+ "grad_norm": 0.025900695472955704,
1006
+ "learning_rate": 4.742545397468656e-05,
1007
+ "loss": 0.1205,
1008
+ "step": 630
1009
+ },
1010
+ {
1011
+ "epoch": 0.063,
1012
+ "eval_cos_sim": 0.8804138898849487,
1013
+ "eval_loss": 0.12054368068921043,
1014
+ "eval_runtime": 169.1165,
1015
+ "eval_samples_per_second": 23.652,
1016
+ "eval_steps_per_second": 0.373,
1017
+ "step": 630
1018
+ },
1019
+ {
1020
+ "epoch": 0.064,
1021
+ "grad_norm": 0.02121679112315178,
1022
+ "learning_rate": 1.3773489637927061e-05,
1023
+ "loss": 0.1208,
1024
+ "step": 640
1025
+ },
1026
+ {
1027
+ "epoch": 0.064,
1028
+ "eval_cos_sim": 0.8801329731941223,
1029
+ "eval_loss": 0.120824953577394,
1030
+ "eval_runtime": 161.444,
1031
+ "eval_samples_per_second": 24.776,
1032
+ "eval_steps_per_second": 0.39,
1033
+ "step": 640
1034
+ },
1035
+ {
1036
+ "epoch": 0.065,
1037
+ "grad_norm": 0.02153482660651207,
1038
+ "learning_rate": 4.7248224043879605e-05,
1039
+ "loss": 0.1211,
1040
+ "step": 650
1041
+ },
1042
+ {
1043
+ "epoch": 0.065,
1044
+ "eval_cos_sim": 0.8802359104156494,
1045
+ "eval_loss": 0.12072229530560447,
1046
+ "eval_runtime": 161.7761,
1047
+ "eval_samples_per_second": 24.726,
1048
+ "eval_steps_per_second": 0.389,
1049
+ "step": 650
1050
+ },
1051
+ {
1052
+ "epoch": 0.066,
1053
+ "grad_norm": 0.030305592343211174,
1054
+ "learning_rate": 1.342226284699112e-05,
1055
+ "loss": 0.1202,
1056
+ "step": 660
1057
+ },
1058
+ {
1059
+ "epoch": 0.066,
1060
+ "eval_cos_sim": 0.8804138898849487,
1061
+ "eval_loss": 0.12054470445858909,
1062
+ "eval_runtime": 166.8157,
1063
+ "eval_samples_per_second": 23.979,
1064
+ "eval_steps_per_second": 0.378,
1065
+ "step": 660
1066
+ },
1067
+ {
1068
+ "epoch": 0.067,
1069
+ "grad_norm": 0.03320358693599701,
1070
+ "learning_rate": 4.7065449389213644e-05,
1071
+ "loss": 0.1216,
1072
+ "step": 670
1073
+ },
1074
+ {
1075
+ "epoch": 0.067,
1076
+ "eval_cos_sim": 0.8803730607032776,
1077
+ "eval_loss": 0.12058561565625144,
1078
+ "eval_runtime": 168.9555,
1079
+ "eval_samples_per_second": 23.675,
1080
+ "eval_steps_per_second": 0.373,
1081
+ "step": 670
1082
+ },
1083
+ {
1084
+ "epoch": 0.068,
1085
+ "grad_norm": 0.03419356420636177,
1086
+ "learning_rate": 1.3073921470878081e-05,
1087
+ "loss": 0.1197,
1088
+ "step": 680
1089
+ },
1090
+ {
1091
+ "epoch": 0.068,
1092
+ "eval_cos_sim": 0.8803737163543701,
1093
+ "eval_loss": 0.12058660843121481,
1094
+ "eval_runtime": 162.073,
1095
+ "eval_samples_per_second": 24.68,
1096
+ "eval_steps_per_second": 0.389,
1097
+ "step": 680
1098
+ },
1099
+ {
1100
+ "epoch": 0.069,
1101
+ "grad_norm": 0.022392097860574722,
1102
+ "learning_rate": 4.6877175561964684e-05,
1103
+ "loss": 0.12,
1104
+ "step": 690
1105
+ },
1106
+ {
1107
+ "epoch": 0.069,
1108
+ "eval_cos_sim": 0.8804360628128052,
1109
+ "eval_loss": 0.12052560474621725,
1110
+ "eval_runtime": 184.9748,
1111
+ "eval_samples_per_second": 21.625,
1112
+ "eval_steps_per_second": 0.341,
1113
+ "step": 690
1114
+ },
1115
+ {
1116
+ "epoch": 0.07,
1117
+ "grad_norm": 0.03007390908896923,
1118
+ "learning_rate": 1.272855232356e-05,
1119
+ "loss": 0.1204,
1120
+ "step": 700
1121
+ },
1122
+ {
1123
+ "epoch": 0.07,
1124
+ "eval_cos_sim": 0.8805059790611267,
1125
+ "eval_loss": 0.12045389034497214,
1126
+ "eval_runtime": 165.5078,
1127
+ "eval_samples_per_second": 24.168,
1128
+ "eval_steps_per_second": 0.381,
1129
+ "step": 700
1130
+ },
1131
+ {
1132
+ "epoch": 0.071,
1133
+ "grad_norm": 0.020742209628224373,
1134
+ "learning_rate": 4.6683449483917846e-05,
1135
+ "loss": 0.12,
1136
+ "step": 710
1137
+ },
1138
+ {
1139
+ "epoch": 0.071,
1140
+ "eval_cos_sim": 0.8806586861610413,
1141
+ "eval_loss": 0.12030088522183371,
1142
+ "eval_runtime": 172.1855,
1143
+ "eval_samples_per_second": 23.231,
1144
+ "eval_steps_per_second": 0.366,
1145
+ "step": 710
1146
+ },
1147
+ {
1148
+ "epoch": 0.072,
1149
+ "grad_norm": 0.023063719272613525,
1150
+ "learning_rate": 1.2386241478270652e-05,
1151
+ "loss": 0.1198,
1152
+ "step": 720
1153
+ },
1154
+ {
1155
+ "epoch": 0.072,
1156
+ "eval_cos_sim": 0.8805540204048157,
1157
+ "eval_loss": 0.12040612079846334,
1158
+ "eval_runtime": 166.7801,
1159
+ "eval_samples_per_second": 23.984,
1160
+ "eval_steps_per_second": 0.378,
1161
+ "step": 720
1162
+ },
1163
+ {
1164
+ "epoch": 0.073,
1165
+ "grad_norm": 0.027647124603390694,
1166
+ "learning_rate": 4.648431943567264e-05,
1167
+ "loss": 0.1205,
1168
+ "step": 730
1169
+ },
1170
+ {
1171
+ "epoch": 0.073,
1172
+ "eval_cos_sim": 0.8805664777755737,
1173
+ "eval_loss": 0.12039038088070822,
1174
+ "eval_runtime": 164.5025,
1175
+ "eval_samples_per_second": 24.316,
1176
+ "eval_steps_per_second": 0.383,
1177
+ "step": 730
1178
+ },
1179
+ {
1180
+ "epoch": 0.074,
1181
+ "grad_norm": 0.02208826318383217,
1182
+ "learning_rate": 1.204707424604792e-05,
1183
+ "loss": 0.1203,
1184
+ "step": 740
1185
+ },
1186
+ {
1187
+ "epoch": 0.074,
1188
+ "eval_cos_sim": 0.8805152177810669,
1189
+ "eval_loss": 0.12043928005444479,
1190
+ "eval_runtime": 178.0321,
1191
+ "eval_samples_per_second": 22.468,
1192
+ "eval_steps_per_second": 0.354,
1193
+ "step": 740
1194
+ },
1195
+ {
1196
+ "epoch": 0.075,
1197
+ "grad_norm": 0.021549325436353683,
1198
+ "learning_rate": 4.627983504461235e-05,
1199
+ "loss": 0.1196,
1200
+ "step": 750
1201
+ },
1202
+ {
1203
+ "epoch": 0.075,
1204
+ "eval_cos_sim": 0.8806087374687195,
1205
+ "eval_loss": 0.12034820940243673,
1206
+ "eval_runtime": 179.3479,
1207
+ "eval_samples_per_second": 22.303,
1208
+ "eval_steps_per_second": 0.351,
1209
+ "step": 750
1210
+ },
1211
+ {
1212
+ "epoch": 0.076,
1213
+ "grad_norm": 0.028022369369864464,
1214
+ "learning_rate": 1.1711135154477562e-05,
1215
+ "loss": 0.1207,
1216
+ "step": 760
1217
+ },
1218
+ {
1219
+ "epoch": 0.076,
1220
+ "eval_cos_sim": 0.8805749416351318,
1221
+ "eval_loss": 0.12038306286084127,
1222
+ "eval_runtime": 176.8723,
1223
+ "eval_samples_per_second": 22.615,
1224
+ "eval_steps_per_second": 0.356,
1225
+ "step": 760
1226
+ },
1227
+ {
1228
+ "epoch": 0.077,
1229
+ "grad_norm": 0.021017303690314293,
1230
+ "learning_rate": 4.607004727253391e-05,
1231
+ "loss": 0.12,
1232
+ "step": 770
1233
+ },
1234
+ {
1235
+ "epoch": 0.077,
1236
+ "eval_cos_sim": 0.8806374669075012,
1237
+ "eval_loss": 0.12032015850293112,
1238
+ "eval_runtime": 163.1866,
1239
+ "eval_samples_per_second": 24.512,
1240
+ "eval_steps_per_second": 0.386,
1241
+ "step": 770
1242
+ },
1243
+ {
1244
+ "epoch": 0.078,
1245
+ "grad_norm": 0.02246786840260029,
1246
+ "learning_rate": 1.1378507926623572e-05,
1247
+ "loss": 0.1199,
1248
+ "step": 780
1249
+ },
1250
+ {
1251
+ "epoch": 0.078,
1252
+ "eval_cos_sim": 0.8806332349777222,
1253
+ "eval_loss": 0.12032350780713034,
1254
+ "eval_runtime": 163.0562,
1255
+ "eval_samples_per_second": 24.531,
1256
+ "eval_steps_per_second": 0.386,
1257
+ "step": 780
1258
+ },
1259
+ {
1260
+ "epoch": 0.079,
1261
+ "grad_norm": 0.021708086133003235,
1262
+ "learning_rate": 4.585500840294793e-05,
1263
+ "loss": 0.1201,
1264
+ "step": 790
1265
+ },
1266
+ {
1267
+ "epoch": 0.079,
1268
+ "eval_cos_sim": 0.8807349801063538,
1269
+ "eval_loss": 0.12022322513806295,
1270
+ "eval_runtime": 173.5217,
1271
+ "eval_samples_per_second": 23.052,
1272
+ "eval_steps_per_second": 0.363,
1273
+ "step": 790
1274
+ },
1275
+ {
1276
+ "epoch": 0.08,
1277
+ "grad_norm": 0.034823887050151825,
1278
+ "learning_rate": 1.1049275460164102e-05,
1279
+ "loss": 0.1204,
1280
+ "step": 800
1281
+ },
1282
+ {
1283
+ "epoch": 0.08,
1284
+ "eval_cos_sim": 0.8806157112121582,
1285
+ "eval_loss": 0.12034289600598289,
1286
+ "eval_runtime": 163.4003,
1287
+ "eval_samples_per_second": 24.48,
1288
+ "eval_steps_per_second": 0.386,
1289
+ "step": 800
1290
+ },
1291
+ {
1292
+ "epoch": 0.081,
1293
+ "grad_norm": 0.02099907584488392,
1294
+ "learning_rate": 4.563477202804924e-05,
1295
+ "loss": 0.1203,
1296
+ "step": 810
1297
+ },
1298
+ {
1299
+ "epoch": 0.081,
1300
+ "eval_cos_sim": 0.8805558681488037,
1301
+ "eval_loss": 0.1204009156440444,
1302
+ "eval_runtime": 171.4073,
1303
+ "eval_samples_per_second": 23.336,
1304
+ "eval_steps_per_second": 0.368,
1305
+ "step": 810
1306
+ },
1307
+ {
1308
+ "epoch": 0.082,
1309
+ "grad_norm": 0.027718910947442055,
1310
+ "learning_rate": 1.0723519806732512e-05,
1311
+ "loss": 0.1206,
1312
+ "step": 820
1313
+ },
1314
+ {
1315
+ "epoch": 0.082,
1316
+ "eval_cos_sim": 0.8804323673248291,
1317
+ "eval_loss": 0.12052262928235007,
1318
+ "eval_runtime": 166.5171,
1319
+ "eval_samples_per_second": 24.022,
1320
+ "eval_steps_per_second": 0.378,
1321
+ "step": 820
1322
+ },
1323
+ {
1324
+ "epoch": 0.083,
1325
+ "grad_norm": 0.030117569491267204,
1326
+ "learning_rate": 4.540939303535997e-05,
1327
+ "loss": 0.1208,
1328
+ "step": 830
1329
+ },
1330
+ {
1331
+ "epoch": 0.083,
1332
+ "eval_cos_sim": 0.8806024193763733,
1333
+ "eval_loss": 0.12035309459912252,
1334
+ "eval_runtime": 172.9809,
1335
+ "eval_samples_per_second": 23.124,
1336
+ "eval_steps_per_second": 0.364,
1337
+ "step": 830
1338
+ },
1339
+ {
1340
+ "epoch": 0.084,
1341
+ "grad_norm": 0.025621019303798676,
1342
+ "learning_rate": 1.0401322151467458e-05,
1343
+ "loss": 0.1207,
1344
+ "step": 840
1345
+ },
1346
+ {
1347
+ "epoch": 0.084,
1348
+ "eval_cos_sim": 0.8805838823318481,
1349
+ "eval_loss": 0.12037316419827414,
1350
+ "eval_runtime": 161.0298,
1351
+ "eval_samples_per_second": 24.84,
1352
+ "eval_steps_per_second": 0.391,
1353
+ "step": 840
1354
+ },
1355
+ {
1356
+ "epoch": 0.085,
1357
+ "grad_norm": 0.037043701857328415,
1358
+ "learning_rate": 4.517892759404947e-05,
1359
+ "loss": 0.1192,
1360
+ "step": 850
1361
+ },
1362
+ {
1363
+ "epoch": 0.085,
1364
+ "eval_cos_sim": 0.8807790279388428,
1365
+ "eval_loss": 0.12017762710797263,
1366
+ "eval_runtime": 165.404,
1367
+ "eval_samples_per_second": 24.183,
1368
+ "eval_steps_per_second": 0.381,
1369
+ "step": 850
1370
+ },
1371
+ {
1372
+ "epoch": 0.086,
1373
+ "grad_norm": 0.024647973477840424,
1374
+ "learning_rate": 1.0082762792778497e-05,
1375
+ "loss": 0.12,
1376
+ "step": 860
1377
+ },
1378
+ {
1379
+ "epoch": 0.086,
1380
+ "eval_cos_sim": 0.8808472156524658,
1381
+ "eval_loss": 0.12010916282879783,
1382
+ "eval_runtime": 159.4806,
1383
+ "eval_samples_per_second": 25.081,
1384
+ "eval_steps_per_second": 0.395,
1385
+ "step": 860
1386
+ },
1387
+ {
1388
+ "epoch": 0.087,
1389
+ "grad_norm": 0.02787039987742901,
1390
+ "learning_rate": 4.494343314093799e-05,
1391
+ "loss": 0.1192,
1392
+ "step": 870
1393
+ },
1394
+ {
1395
+ "epoch": 0.087,
1396
+ "eval_cos_sim": 0.8806514143943787,
1397
+ "eval_loss": 0.12030471136319112,
1398
+ "eval_runtime": 167.7185,
1399
+ "eval_samples_per_second": 23.849,
1400
+ "eval_steps_per_second": 0.376,
1401
+ "step": 870
1402
+ },
1403
+ {
1404
+ "epoch": 0.088,
1405
+ "grad_norm": 0.027198661118745804,
1406
+ "learning_rate": 9.767921122337203e-06,
1407
+ "loss": 0.12,
1408
+ "step": 880
1409
+ },
1410
+ {
1411
+ "epoch": 0.088,
1412
+ "eval_cos_sim": 0.8806140422821045,
1413
+ "eval_loss": 0.12034183456646871,
1414
+ "eval_runtime": 164.5125,
1415
+ "eval_samples_per_second": 24.314,
1416
+ "eval_steps_per_second": 0.383,
1417
+ "step": 880
1418
+ },
1419
+ {
1420
+ "epoch": 0.089,
1421
+ "grad_norm": 0.020295780152082443,
1422
+ "learning_rate": 4.4702968366179995e-05,
1423
+ "loss": 0.121,
1424
+ "step": 890
1425
+ },
1426
+ {
1427
+ "epoch": 0.089,
1428
+ "eval_cos_sim": 0.8807177543640137,
1429
+ "eval_loss": 0.12023961307751609,
1430
+ "eval_runtime": 178.9283,
1431
+ "eval_samples_per_second": 22.355,
1432
+ "eval_steps_per_second": 0.352,
1433
+ "step": 890
1434
+ },
1435
+ {
1436
+ "epoch": 0.09,
1437
+ "grad_norm": 0.025682412087917328,
1438
+ "learning_rate": 9.456875605287963e-06,
1439
+ "loss": 0.1197,
1440
+ "step": 900
1441
+ },
1442
+ {
1443
+ "epoch": 0.09,
1444
+ "eval_cos_sim": 0.8808146715164185,
1445
+ "eval_loss": 0.12014213421093893,
1446
+ "eval_runtime": 162.1503,
1447
+ "eval_samples_per_second": 24.668,
1448
+ "eval_steps_per_second": 0.389,
1449
+ "step": 900
1450
+ },
1451
+ {
1452
+ "epoch": 0.091,
1453
+ "grad_norm": 0.0241321362555027,
1454
+ "learning_rate": 4.4457593198638266e-05,
1455
+ "loss": 0.1204,
1456
+ "step": 910
1457
+ },
1458
+ {
1459
+ "epoch": 0.091,
1460
+ "eval_cos_sim": 0.8806710243225098,
1461
+ "eval_loss": 0.12028472664105368,
1462
+ "eval_runtime": 170.9864,
1463
+ "eval_samples_per_second": 23.394,
1464
+ "eval_steps_per_second": 0.368,
1465
+ "step": 910
1466
+ },
1467
+ {
1468
+ "epoch": 0.092,
1469
+ "grad_norm": 0.03511843457818031,
1470
+ "learning_rate": 9.149703760693733e-06,
1471
+ "loss": 0.1204,
1472
+ "step": 920
1473
+ },
1474
+ {
1475
+ "epoch": 0.092,
1476
+ "eval_cos_sim": 0.8806130886077881,
1477
+ "eval_loss": 0.12034211971508932,
1478
+ "eval_runtime": 163.6776,
1479
+ "eval_samples_per_second": 24.438,
1480
+ "eval_steps_per_second": 0.385,
1481
+ "step": 920
1482
+ },
1483
+ {
1484
+ "epoch": 0.093,
1485
+ "grad_norm": 0.03159726411104202,
1486
+ "learning_rate": 4.420736879094911e-05,
1487
+ "loss": 0.1208,
1488
+ "step": 930
1489
+ },
1490
+ {
1491
+ "epoch": 0.093,
1492
+ "eval_cos_sim": 0.8807392716407776,
1493
+ "eval_loss": 0.12021884395825339,
1494
+ "eval_runtime": 172.0497,
1495
+ "eval_samples_per_second": 23.249,
1496
+ "eval_steps_per_second": 0.366,
1497
+ "step": 930
1498
+ },
1499
+ {
1500
+ "epoch": 0.094,
1501
+ "grad_norm": 0.02288082055747509,
1502
+ "learning_rate": 8.846482142219678e-06,
1503
+ "loss": 0.1206,
1504
+ "step": 940
1505
+ },
1506
+ {
1507
+ "epoch": 0.094,
1508
+ "eval_cos_sim": 0.8808532953262329,
1509
+ "eval_loss": 0.12010584022747946,
1510
+ "eval_runtime": 160.6917,
1511
+ "eval_samples_per_second": 24.892,
1512
+ "eval_steps_per_second": 0.392,
1513
+ "step": 940
1514
+ },
1515
+ {
1516
+ "epoch": 0.095,
1517
+ "grad_norm": 0.022692304104566574,
1518
+ "learning_rate": 4.395235750428116e-05,
1519
+ "loss": 0.1193,
1520
+ "step": 950
1521
+ },
1522
+ {
1523
+ "epoch": 0.095,
1524
+ "eval_cos_sim": 0.8807929158210754,
1525
+ "eval_loss": 0.1201639943336196,
1526
+ "eval_runtime": 165.9011,
1527
+ "eval_samples_per_second": 24.111,
1528
+ "eval_steps_per_second": 0.38,
1529
+ "step": 950
1530
+ },
1531
+ {
1532
+ "epoch": 0.096,
1533
+ "grad_norm": 0.02069064788520336,
1534
+ "learning_rate": 8.547286319049193e-06,
1535
+ "loss": 0.1204,
1536
+ "step": 960
1537
+ },
1538
+ {
1539
+ "epoch": 0.096,
1540
+ "eval_cos_sim": 0.8806983232498169,
1541
+ "eval_loss": 0.12025791980969382,
1542
+ "eval_runtime": 163.2536,
1543
+ "eval_samples_per_second": 24.502,
1544
+ "eval_steps_per_second": 0.386,
1545
+ "step": 960
1546
+ },
1547
+ {
1548
+ "epoch": 0.097,
1549
+ "grad_norm": 0.024405937641859055,
1550
+ "learning_rate": 4.369262289279257e-05,
1551
+ "loss": 0.12,
1552
+ "step": 970
1553
+ },
1554
+ {
1555
+ "epoch": 0.097,
1556
+ "eval_cos_sim": 0.8808521628379822,
1557
+ "eval_loss": 0.1201059584830947,
1558
+ "eval_runtime": 161.1408,
1559
+ "eval_samples_per_second": 24.823,
1560
+ "eval_steps_per_second": 0.391,
1561
+ "step": 970
1562
+ }
1563
+ ],
1564
+ "logging_steps": 10,
1565
+ "max_steps": 10000,
1566
+ "num_input_tokens_seen": 0,
1567
+ "num_train_epochs": 9223372036854775807,
1568
+ "save_steps": 10,
1569
+ "stateful_callbacks": {
1570
+ "TrainerControl": {
1571
+ "args": {
1572
+ "should_epoch_stop": false,
1573
+ "should_evaluate": false,
1574
+ "should_log": false,
1575
+ "should_save": true,
1576
+ "should_training_stop": false
1577
+ },
1578
+ "attributes": {}
1579
+ }
1580
+ },
1581
+ "total_flos": 0.0,
1582
+ "train_batch_size": 440,
1583
+ "trial_name": null,
1584
+ "trial_params": null
1585
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5ffeff2858a823619b1d15e1ef15f56c829886ecd33d46ec591b342d231b711
3
+ size 5176