ganghyeon committed
Commit cc5d710
1 Parent(s): 866b143

Upload fine-tuned Llama model for order analysis

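The files in this commit form a PEFT (LoRA) adapter checkpoint rather than full model weights. For orientation only, such an adapter is typically attached to its base model with peft as sketched below; the base model id and adapter repository id are placeholders, since neither is stated in this diff.

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL = "<base-model-id>"        # placeholder: the real id is base_model_name_or_path in adapter_config.json
ADAPTER_REPO = "<this-adapter-repo>"  # placeholder: the repository this commit belongs to

base = AutoModelForCausalLM.from_pretrained(BASE_MODEL)
model = PeftModel.from_pretrained(base, ADAPTER_REPO)  # loads adapter_model.safetensors per adapter_config.json
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)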
adapter_config.json CHANGED
@@ -21,12 +21,12 @@
   "revision": null,
   "target_modules": [
     "q_proj",
+    "o_proj",
     "down_proj",
-    "k_proj",
     "up_proj",
-    "o_proj",
     "v_proj",
-    "gate_proj"
+    "gate_proj",
+    "k_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:db10aadcef7e79bd7c1845b1e4a5d7e3a479ae5afbcabbda51dff0170fbb63b9
+oid sha256:f387711e63c8ab74a59664fd7c1a08083964c4f023e8f9c181ad0376b5caf698
 size 22573704
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0fad6449d5850fc23d58409b38ae7dc42f3bd1ad94828621f7d88b3244f4d267
+oid sha256:e545c92e8a29e73b2a708438df82a773c90ac81fb95e3d77084344d71129ce4c
 size 45276986
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4f2ca99e2c5f8daa141e4bca82e3ff3f06813039feda2777d1c1eaa7b0d89f33
+oid sha256:134bd3caf7fa7a05a76100cdc2365343eb2f59dc0c82afde6756800f9549f1f9
 size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:20715e03e6f2f2744bc1fba8c744854dc7b82c89d8238b57676982792c428010
+oid sha256:d154ed8833a95231b13495341f4d5eda62192ea0a3a9aa2af2ce186e4e571f34
 size 1064
trainer_state.json CHANGED
@@ -3,210 +3,504 @@
   "best_model_checkpoint": null,
   "epoch": 1.0,
   "eval_steps": 500,
-  "global_step": 560,
+  "global_step": 1400,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.03571428571428571,
-      "grad_norm": 1.8447134494781494,
-      "learning_rate": 3.571428571428572e-05,
-      "loss": 2.5344,
+      "epoch": 0.014285714285714285,
+      "grad_norm": 2.279883623123169,
+      "learning_rate": 1.4285714285714285e-05,
+      "loss": 2.5982,
       "step": 20
     },
     {
-      "epoch": 0.07142857142857142,
-      "grad_norm": 1.3903896808624268,
-      "learning_rate": 7.142857142857143e-05,
-      "loss": 1.8578,
+      "epoch": 0.02857142857142857,
+      "grad_norm": 1.5012547969818115,
+      "learning_rate": 2.857142857142857e-05,
+      "loss": 2.3142,
       "step": 40
     },
     {
-      "epoch": 0.10714285714285714,
-      "grad_norm": 0.9996151328086853,
-      "learning_rate": 9.920634920634922e-05,
-      "loss": 1.161,
+      "epoch": 0.04285714285714286,
+      "grad_norm": 1.7552474737167358,
+      "learning_rate": 4.2857142857142856e-05,
+      "loss": 1.7833,
       "step": 60
     },
     {
-      "epoch": 0.14285714285714285,
-      "grad_norm": 0.7677631974220276,
-      "learning_rate": 9.523809523809524e-05,
-      "loss": 0.9116,
+      "epoch": 0.05714285714285714,
+      "grad_norm": 1.8790605068206787,
+      "learning_rate": 5.714285714285714e-05,
+      "loss": 1.2697,
       "step": 80
     },
     {
-      "epoch": 0.17857142857142858,
-      "grad_norm": 1.040696144104004,
-      "learning_rate": 9.126984126984128e-05,
-      "loss": 0.8243,
+      "epoch": 0.07142857142857142,
+      "grad_norm": 1.6815859079360962,
+      "learning_rate": 7.142857142857143e-05,
+      "loss": 1.049,
       "step": 100
     },
     {
-      "epoch": 0.21428571428571427,
-      "grad_norm": 0.970762312412262,
-      "learning_rate": 8.730158730158731e-05,
-      "loss": 0.7555,
+      "epoch": 0.08571428571428572,
+      "grad_norm": 1.561714768409729,
+      "learning_rate": 8.571428571428571e-05,
+      "loss": 0.9491,
       "step": 120
     },
     {
-      "epoch": 0.25,
-      "grad_norm": 1.0803948640823364,
-      "learning_rate": 8.333333333333334e-05,
-      "loss": 0.6679,
+      "epoch": 0.1,
+      "grad_norm": 1.4625489711761475,
+      "learning_rate": 0.0001,
+      "loss": 0.8847,
       "step": 140
     },
     {
-      "epoch": 0.2857142857142857,
-      "grad_norm": 1.0338200330734253,
-      "learning_rate": 7.936507936507937e-05,
-      "loss": 0.6522,
+      "epoch": 0.11428571428571428,
+      "grad_norm": 2.1226258277893066,
+      "learning_rate": 9.841269841269841e-05,
+      "loss": 0.8167,
       "step": 160
     },
     {
-      "epoch": 0.32142857142857145,
-      "grad_norm": 0.8714532256126404,
-      "learning_rate": 7.53968253968254e-05,
-      "loss": 0.6209,
+      "epoch": 0.12857142857142856,
+      "grad_norm": 1.3709640502929688,
+      "learning_rate": 9.682539682539682e-05,
+      "loss": 0.7321,
       "step": 180
     },
     {
-      "epoch": 0.35714285714285715,
-      "grad_norm": 0.9677976965904236,
-      "learning_rate": 7.142857142857143e-05,
-      "loss": 0.6242,
+      "epoch": 0.14285714285714285,
+      "grad_norm": 1.2312499284744263,
+      "learning_rate": 9.523809523809524e-05,
+      "loss": 0.693,
       "step": 200
     },
     {
-      "epoch": 0.39285714285714285,
-      "grad_norm": 0.962899386882782,
-      "learning_rate": 6.746031746031747e-05,
-      "loss": 0.5844,
+      "epoch": 0.15714285714285714,
+      "grad_norm": 1.4416557550430298,
+      "learning_rate": 9.365079365079366e-05,
+      "loss": 0.6389,
       "step": 220
     },
     {
-      "epoch": 0.42857142857142855,
-      "grad_norm": 1.0176018476486206,
-      "learning_rate": 6.349206349206349e-05,
-      "loss": 0.5915,
+      "epoch": 0.17142857142857143,
+      "grad_norm": 1.7498096227645874,
+      "learning_rate": 9.206349206349206e-05,
+      "loss": 0.6421,
       "step": 240
     },
     {
-      "epoch": 0.4642857142857143,
-      "grad_norm": 0.9592286348342896,
-      "learning_rate": 5.9523809523809524e-05,
-      "loss": 0.5948,
+      "epoch": 0.18571428571428572,
+      "grad_norm": 1.6708226203918457,
+      "learning_rate": 9.047619047619048e-05,
+      "loss": 0.6309,
       "step": 260
     },
     {
-      "epoch": 0.5,
-      "grad_norm": 0.9406882524490356,
-      "learning_rate": 5.555555555555556e-05,
-      "loss": 0.5557,
+      "epoch": 0.2,
+      "grad_norm": 1.7032530307769775,
+      "learning_rate": 8.888888888888889e-05,
+      "loss": 0.6388,
       "step": 280
     },
     {
-      "epoch": 0.5357142857142857,
-      "grad_norm": 1.0566672086715698,
-      "learning_rate": 5.158730158730159e-05,
-      "loss": 0.5596,
+      "epoch": 0.21428571428571427,
+      "grad_norm": 1.1614326238632202,
+      "learning_rate": 8.730158730158731e-05,
+      "loss": 0.5888,
       "step": 300
     },
     {
-      "epoch": 0.5714285714285714,
-      "grad_norm": 1.0166610479354858,
-      "learning_rate": 4.761904761904762e-05,
-      "loss": 0.5514,
+      "epoch": 0.22857142857142856,
+      "grad_norm": 1.7418193817138672,
+      "learning_rate": 8.571428571428571e-05,
+      "loss": 0.5878,
       "step": 320
     },
     {
-      "epoch": 0.6071428571428571,
-      "grad_norm": 1.110986590385437,
-      "learning_rate": 4.3650793650793655e-05,
-      "loss": 0.5523,
+      "epoch": 0.24285714285714285,
+      "grad_norm": 1.3387174606323242,
+      "learning_rate": 8.412698412698413e-05,
+      "loss": 0.5962,
       "step": 340
     },
     {
-      "epoch": 0.6428571428571429,
-      "grad_norm": 1.1951007843017578,
-      "learning_rate": 3.968253968253968e-05,
-      "loss": 0.5395,
+      "epoch": 0.2571428571428571,
+      "grad_norm": 1.1994811296463013,
+      "learning_rate": 8.253968253968255e-05,
+      "loss": 0.6093,
       "step": 360
     },
     {
-      "epoch": 0.6785714285714286,
-      "grad_norm": 1.0911144018173218,
-      "learning_rate": 3.571428571428572e-05,
-      "loss": 0.548,
+      "epoch": 0.2714285714285714,
+      "grad_norm": 1.5204330682754517,
+      "learning_rate": 8.095238095238096e-05,
+      "loss": 0.6045,
       "step": 380
     },
     {
-      "epoch": 0.7142857142857143,
-      "grad_norm": 1.1718319654464722,
-      "learning_rate": 3.1746031746031745e-05,
-      "loss": 0.5188,
+      "epoch": 0.2857142857142857,
+      "grad_norm": 1.2687711715698242,
+      "learning_rate": 7.936507936507937e-05,
+      "loss": 0.5934,
       "step": 400
     },
     {
-      "epoch": 0.75,
-      "grad_norm": 0.9721041917800903,
-      "learning_rate": 2.777777777777778e-05,
-      "loss": 0.5081,
+      "epoch": 0.3,
+      "grad_norm": 1.4332380294799805,
+      "learning_rate": 7.777777777777778e-05,
+      "loss": 0.5883,
       "step": 420
     },
     {
-      "epoch": 0.7857142857142857,
-      "grad_norm": 1.103784203529358,
-      "learning_rate": 2.380952380952381e-05,
-      "loss": 0.5183,
+      "epoch": 0.3142857142857143,
+      "grad_norm": 1.5756443738937378,
+      "learning_rate": 7.619047619047618e-05,
+      "loss": 0.5605,
       "step": 440
     },
     {
-      "epoch": 0.8214285714285714,
-      "grad_norm": 1.1778147220611572,
-      "learning_rate": 1.984126984126984e-05,
-      "loss": 0.5064,
+      "epoch": 0.32857142857142857,
+      "grad_norm": 1.405213713645935,
+      "learning_rate": 7.460317460317461e-05,
+      "loss": 0.5993,
       "step": 460
     },
     {
-      "epoch": 0.8571428571428571,
-      "grad_norm": 1.0783812999725342,
-      "learning_rate": 1.5873015873015872e-05,
-      "loss": 0.527,
+      "epoch": 0.34285714285714286,
+      "grad_norm": 1.480230450630188,
+      "learning_rate": 7.301587301587302e-05,
+      "loss": 0.5896,
       "step": 480
     },
     {
-      "epoch": 0.8928571428571429,
-      "grad_norm": 1.0787134170532227,
-      "learning_rate": 1.1904761904761905e-05,
-      "loss": 0.5132,
+      "epoch": 0.35714285714285715,
+      "grad_norm": 1.472406029701233,
+      "learning_rate": 7.142857142857143e-05,
+      "loss": 0.5661,
       "step": 500
     },
     {
-      "epoch": 0.9285714285714286,
-      "grad_norm": 1.0964124202728271,
-      "learning_rate": 7.936507936507936e-06,
-      "loss": 0.4971,
+      "epoch": 0.37142857142857144,
+      "grad_norm": 1.408607006072998,
+      "learning_rate": 6.984126984126984e-05,
+      "loss": 0.545,
       "step": 520
     },
     {
-      "epoch": 0.9642857142857143,
-      "grad_norm": 1.1419587135314941,
-      "learning_rate": 3.968253968253968e-06,
-      "loss": 0.5368,
+      "epoch": 0.38571428571428573,
+      "grad_norm": 1.3194152116775513,
+      "learning_rate": 6.825396825396825e-05,
+      "loss": 0.5366,
       "step": 540
     },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.5078984498977661,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.5236,
+      "step": 560
+    },
+    {
+      "epoch": 0.4142857142857143,
+      "grad_norm": 1.3387917280197144,
+      "learning_rate": 6.507936507936509e-05,
+      "loss": 0.5545,
+      "step": 580
+    },
+    {
+      "epoch": 0.42857142857142855,
+      "grad_norm": 1.1835085153579712,
+      "learning_rate": 6.349206349206349e-05,
+      "loss": 0.563,
+      "step": 600
+    },
+    {
+      "epoch": 0.44285714285714284,
+      "grad_norm": 1.424862027168274,
+      "learning_rate": 6.19047619047619e-05,
+      "loss": 0.5369,
+      "step": 620
+    },
+    {
+      "epoch": 0.45714285714285713,
+      "grad_norm": 1.3369919061660767,
+      "learning_rate": 6.0317460317460316e-05,
+      "loss": 0.576,
+      "step": 640
+    },
+    {
+      "epoch": 0.4714285714285714,
+      "grad_norm": 1.2523393630981445,
+      "learning_rate": 5.873015873015873e-05,
+      "loss": 0.5245,
+      "step": 660
+    },
+    {
+      "epoch": 0.4857142857142857,
+      "grad_norm": 1.6725609302520752,
+      "learning_rate": 5.714285714285714e-05,
+      "loss": 0.5047,
+      "step": 680
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.3288273811340332,
+      "learning_rate": 5.555555555555556e-05,
+      "loss": 0.5396,
+      "step": 700
+    },
+    {
+      "epoch": 0.5142857142857142,
+      "grad_norm": 1.492069125175476,
+      "learning_rate": 5.396825396825397e-05,
+      "loss": 0.5099,
+      "step": 720
+    },
+    {
+      "epoch": 0.5285714285714286,
+      "grad_norm": 1.508617639541626,
+      "learning_rate": 5.2380952380952384e-05,
+      "loss": 0.503,
+      "step": 740
+    },
+    {
+      "epoch": 0.5428571428571428,
+      "grad_norm": 1.6115648746490479,
+      "learning_rate": 5.0793650793650794e-05,
+      "loss": 0.5571,
+      "step": 760
+    },
+    {
+      "epoch": 0.5571428571428572,
+      "grad_norm": 1.4812785387039185,
+      "learning_rate": 4.9206349206349204e-05,
+      "loss": 0.5036,
+      "step": 780
+    },
+    {
+      "epoch": 0.5714285714285714,
+      "grad_norm": 1.555457353591919,
+      "learning_rate": 4.761904761904762e-05,
+      "loss": 0.518,
+      "step": 800
+    },
+    {
+      "epoch": 0.5857142857142857,
+      "grad_norm": 1.6743320226669312,
+      "learning_rate": 4.603174603174603e-05,
+      "loss": 0.523,
+      "step": 820
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.6365365982055664,
+      "learning_rate": 4.4444444444444447e-05,
+      "loss": 0.5112,
+      "step": 840
+    },
+    {
+      "epoch": 0.6142857142857143,
+      "grad_norm": 1.4804445505142212,
+      "learning_rate": 4.2857142857142856e-05,
+      "loss": 0.5177,
+      "step": 860
+    },
+    {
+      "epoch": 0.6285714285714286,
+      "grad_norm": 1.5929114818572998,
+      "learning_rate": 4.126984126984127e-05,
+      "loss": 0.4895,
+      "step": 880
+    },
+    {
+      "epoch": 0.6428571428571429,
+      "grad_norm": 1.512065052986145,
+      "learning_rate": 3.968253968253968e-05,
+      "loss": 0.5158,
+      "step": 900
+    },
+    {
+      "epoch": 0.6571428571428571,
+      "grad_norm": 1.5385123491287231,
+      "learning_rate": 3.809523809523809e-05,
+      "loss": 0.5188,
+      "step": 920
+    },
+    {
+      "epoch": 0.6714285714285714,
+      "grad_norm": 1.7010993957519531,
+      "learning_rate": 3.650793650793651e-05,
+      "loss": 0.5039,
+      "step": 940
+    },
+    {
+      "epoch": 0.6857142857142857,
+      "grad_norm": 1.4756510257720947,
+      "learning_rate": 3.492063492063492e-05,
+      "loss": 0.5004,
+      "step": 960
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.407616376876831,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 0.4947,
+      "step": 980
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 1.376063346862793,
+      "learning_rate": 3.1746031746031745e-05,
+      "loss": 0.4797,
+      "step": 1000
+    },
+    {
+      "epoch": 0.7285714285714285,
+      "grad_norm": 1.6061830520629883,
+      "learning_rate": 3.0158730158730158e-05,
+      "loss": 0.4872,
+      "step": 1020
+    },
+    {
+      "epoch": 0.7428571428571429,
+      "grad_norm": 1.4005217552185059,
+      "learning_rate": 2.857142857142857e-05,
+      "loss": 0.4648,
+      "step": 1040
+    },
+    {
+      "epoch": 0.7571428571428571,
+      "grad_norm": 1.4235899448394775,
+      "learning_rate": 2.6984126984126984e-05,
+      "loss": 0.463,
+      "step": 1060
+    },
+    {
+      "epoch": 0.7714285714285715,
+      "grad_norm": 1.210481882095337,
+      "learning_rate": 2.5396825396825397e-05,
+      "loss": 0.4728,
+      "step": 1080
+    },
+    {
+      "epoch": 0.7857142857142857,
+      "grad_norm": 1.6099470853805542,
+      "learning_rate": 2.380952380952381e-05,
+      "loss": 0.485,
+      "step": 1100
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.6083734035491943,
+      "learning_rate": 2.2222222222222223e-05,
+      "loss": 0.4594,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8142857142857143,
+      "grad_norm": 1.350246548652649,
+      "learning_rate": 2.0634920634920636e-05,
+      "loss": 0.4763,
+      "step": 1140
+    },
+    {
+      "epoch": 0.8285714285714286,
+      "grad_norm": 1.3000835180282593,
+      "learning_rate": 1.9047619047619046e-05,
+      "loss": 0.4705,
+      "step": 1160
+    },
+    {
+      "epoch": 0.8428571428571429,
+      "grad_norm": 1.2059348821640015,
+      "learning_rate": 1.746031746031746e-05,
+      "loss": 0.4876,
+      "step": 1180
+    },
+    {
+      "epoch": 0.8571428571428571,
+      "grad_norm": 1.3652459383010864,
+      "learning_rate": 1.5873015873015872e-05,
+      "loss": 0.4919,
+      "step": 1200
+    },
+    {
+      "epoch": 0.8714285714285714,
+      "grad_norm": 1.451910376548767,
+      "learning_rate": 1.4285714285714285e-05,
+      "loss": 0.463,
+      "step": 1220
+    },
+    {
+      "epoch": 0.8857142857142857,
+      "grad_norm": 1.4704546928405762,
+      "learning_rate": 1.2698412698412699e-05,
+      "loss": 0.4768,
+      "step": 1240
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.5009324550628662,
+      "learning_rate": 1.1111111111111112e-05,
+      "loss": 0.4516,
+      "step": 1260
+    },
+    {
+      "epoch": 0.9142857142857143,
+      "grad_norm": 1.5519717931747437,
+      "learning_rate": 9.523809523809523e-06,
+      "loss": 0.4517,
+      "step": 1280
+    },
+    {
+      "epoch": 0.9285714285714286,
+      "grad_norm": 1.5606343746185303,
+      "learning_rate": 7.936507936507936e-06,
+      "loss": 0.4732,
+      "step": 1300
+    },
+    {
+      "epoch": 0.9428571428571428,
+      "grad_norm": 1.3639295101165771,
+      "learning_rate": 6.349206349206349e-06,
+      "loss": 0.4791,
+      "step": 1320
+    },
+    {
+      "epoch": 0.9571428571428572,
+      "grad_norm": 1.6867655515670776,
+      "learning_rate": 4.7619047619047615e-06,
+      "loss": 0.4968,
+      "step": 1340
+    },
+    {
+      "epoch": 0.9714285714285714,
+      "grad_norm": 1.4185600280761719,
+      "learning_rate": 3.1746031746031746e-06,
+      "loss": 0.4902,
+      "step": 1360
+    },
+    {
+      "epoch": 0.9857142857142858,
+      "grad_norm": 1.7370814085006714,
+      "learning_rate": 1.5873015873015873e-06,
+      "loss": 0.4564,
+      "step": 1380
+    },
     {
       "epoch": 1.0,
-      "grad_norm": 1.0203648805618286,
+      "grad_norm": 1.6167285442352295,
       "learning_rate": 0.0,
-      "loss": 0.501,
-      "step": 560
+      "loss": 0.4652,
+      "step": 1400
     }
   ],
   "logging_steps": 20,
-  "max_steps": 560,
+  "max_steps": 1400,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 50,
@@ -223,7 +517,7 @@
     }
   },
   "total_flos": 8419093040332800.0,
-  "train_batch_size": 5,
+  "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null
 }
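The new log is consistent with a linear schedule that warms up to the 1e-4 peak over the first 140 steps (10% of the 1400 total) and then decays linearly to zero: 1e-4 * 20/140 ≈ 1.43e-5 at step 20, and 1e-4 * 1240/1260 ≈ 9.84e-5 at step 160. A small sketch that reproduces those values with transformers' linear warmup schedule follows; the optimizer and its single dummy parameter are placeholders, not taken from the commit.

import torch
from transformers import get_linear_schedule_with_warmup

# Dummy parameter/optimizer pair; only the schedule shape matters here.
params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.AdamW(params, lr=1e-4)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=140, num_training_steps=1400
)

for step in range(1, 1401):
    optimizer.step()
    scheduler.step()
    if step in (20, 140, 160, 1400):
        print(step, scheduler.get_last_lr()[0])
# Expected: ~1.4286e-05, 1e-04, ~9.8413e-05, 0.0 -- matching the logged learning_rate values.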
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a7dff0390e1e03cec6abf66e8ef1403103f88f4c39cff41b36116dc56386757a
+oid sha256:39f98817715f279a5f40c38dd70904c7137598047d2e35bac717a82d7f015fd1
 size 5560