Upload experiments/2023-02-21-b0010c97cb1f06debca911602ea05b6ff85a8270fb9487d27b3d52eb4eb29e9e with huggingface_hub
Browse files
experiments/2023-02-21-b0010c97cb1f06debca911602ea05b6ff85a8270fb9487d27b3d52eb4eb29e9e/output.log
ADDED
@@ -0,0 +1,543 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
0 |
0%| | 0/430 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
|
|
|
1 |
0%| | 1/430 [01:21<9:41:43, 81.36s/it]
|
2 |
|
|
|
3 |
0%| | 1/430 [01:21<9:41:43, 81.36s/it]
|
4 |
0%| | 2/430 [02:41<9:36:10, 80.77s/it]
|
5 |
1%| | 3/430 [04:02<9:33:33, 80.59s/it]
|
6 |
1%| | 4/430 [05:22<9:31:45, 80.53s/it]
|
7 |
1%| | 5/430 [06:42<9:30:07, 80.49s/it]
|
8 |
|
|
|
9 |
1%| | 5/430 [06:42<9:30:07, 80.49s/it]
|
10 |
1%|β | 6/430 [08:03<9:28:34, 80.46s/it]
|
11 |
2%|β | 7/430 [09:24<9:28:07, 80.58s/it]
|
12 |
2%|β | 8/430 [10:44<9:26:23, 80.53s/it]
|
13 |
2%|β | 9/430 [12:05<9:24:58, 80.52s/it]
|
14 |
2%|β | 10/430 [13:25<9:23:23, 80.49s/it]
|
15 |
|
|
|
16 |
2%|β | 10/430 [13:25<9:23:23, 80.49s/it]
|
17 |
3%|β | 11/430 [14:45<9:22:02, 80.48s/it]
|
18 |
3%|β | 12/430 [16:06<9:20:48, 80.50s/it]
|
19 |
3%|β | 13/430 [17:26<9:19:19, 80.48s/it]
|
20 |
3%|β | 14/430 [18:47<9:18:17, 80.52s/it]
|
21 |
3%|β | 15/430 [20:08<9:16:48, 80.50s/it]
|
22 |
|
|
|
23 |
3%|β | 15/430 [20:08<9:16:48, 80.50s/it]
|
24 |
4%|β | 16/430 [21:28<9:15:34, 80.52s/it]
|
25 |
4%|β | 17/430 [22:49<9:14:08, 80.51s/it]
|
26 |
4%|β | 18/430 [24:09<9:12:33, 80.47s/it]
|
27 |
4%|β | 19/430 [25:30<9:11:22, 80.49s/it]
|
28 |
5%|β | 20/430 [26:50<9:09:51, 80.47s/it]
|
29 |
|
|
|
30 |
5%|β | 20/430 [26:50<9:09:51, 80.47s/it]
|
31 |
5%|β | 21/430 [28:10<9:08:25, 80.45s/it]
|
32 |
5%|β | 22/430 [29:31<9:08:02, 80.59s/it]
|
33 |
5%|β | 23/430 [30:52<9:06:16, 80.53s/it]
|
34 |
6%|β | 24/430 [32:12<9:04:45, 80.51s/it]
|
35 |
6%|β | 25/430 [33:32<9:03:06, 80.46s/it]
|
36 |
|
|
|
37 |
6%|β | 25/430 [33:32<9:03:06, 80.46s/it]
|
38 |
6%|β | 26/430 [34:53<9:01:27, 80.42s/it]
|
39 |
6%|β | 27/430 [36:13<8:59:58, 80.39s/it]
|
40 |
7%|β | 28/430 [37:33<8:58:26, 80.36s/it]
|
41 |
7%|β | 29/430 [38:54<8:57:43, 80.46s/it]
|
42 |
7%|β | 30/430 [40:14<8:56:19, 80.45s/it]
|
43 |
|
|
|
44 |
7%|β | 30/430 [40:14<8:56:19, 80.45s/it]
|
45 |
7%|β | 31/430 [41:35<8:55:05, 80.46s/it]
|
46 |
7%|β | 32/430 [42:55<8:53:43, 80.46s/it]
|
47 |
8%|β | 33/430 [44:16<8:52:17, 80.45s/it]
|
48 |
8%|β | 34/430 [45:36<8:50:56, 80.45s/it]
|
49 |
8%|β | 35/430 [46:57<8:49:37, 80.45s/it]
|
50 |
|
|
|
51 |
8%|β | 35/430 [46:57<8:49:37, 80.45s/it]
|
52 |
8%|β | 36/430 [48:17<8:48:22, 80.46s/it]
|
53 |
9%|β | 37/430 [49:38<8:46:59, 80.46s/it]
|
54 |
9%|β | 38/430 [50:58<8:45:38, 80.46s/it]
|
55 |
9%|β | 39/430 [52:19<8:44:24, 80.47s/it]
|
56 |
9%|β | 40/430 [53:39<8:43:03, 80.47s/it]
|
57 |
|
|
|
58 |
9%|β | 40/430 [53:39<8:43:03, 80.47s/it]
|
59 |
10%|β | 41/430 [55:00<8:42:48, 80.64s/it]
|
60 |
10%|β | 42/430 [56:21<8:41:07, 80.59s/it]
|
61 |
10%|β | 43/430 [57:41<8:39:31, 80.55s/it]
|
62 |
10%|β | 44/430 [59:50<10:12:21, 95.19s/it]
|
63 |
10%|β | 45/430 [1:01:11<9:42:25, 90.77s/it]
|
64 |
|
|
|
65 |
10%|β | 45/430 [1:01:11<9:42:25, 90.77s/it]
|
66 |
11%|β | 46/430 [1:02:31<9:21:05, 87.67s/it]
|
67 |
11%|β | 47/430 [1:03:52<9:06:42, 85.65s/it]
|
68 |
11%|β | 48/430 [1:05:13<8:55:45, 84.15s/it]
|
69 |
11%|ββ | 49/430 [1:06:33<8:47:15, 83.03s/it]
|
70 |
12%|ββ | 50/430 [1:07:54<8:41:00, 82.27s/it]
|
71 |
|
|
|
72 |
12%|ββ | 50/430 [1:07:54<8:41:00, 82.27s/it]
|
73 |
12%|ββ | 51/430 [1:09:14<8:36:14, 81.73s/it]
|
74 |
12%|ββ | 52/430 [1:10:35<8:32:33, 81.36s/it]
|
75 |
12%|ββ | 53/430 [1:11:55<8:30:00, 81.17s/it]
|
76 |
13%|ββ | 54/430 [1:13:16<8:27:22, 80.96s/it]
|
77 |
13%|ββ | 55/430 [1:14:37<8:25:21, 80.86s/it]
|
78 |
|
|
|
79 |
13%|ββ | 55/430 [1:14:37<8:25:21, 80.86s/it]
|
80 |
13%|ββ | 56/430 [1:15:57<8:23:29, 80.77s/it]
|
81 |
13%|ββ | 57/430 [1:17:18<8:21:34, 80.68s/it]
|
82 |
13%|ββ | 58/430 [1:18:38<8:19:56, 80.63s/it]
|
83 |
14%|ββ | 59/430 [1:19:59<8:18:24, 80.60s/it]
|
84 |
14%|ββ | 60/430 [1:21:19<8:16:44, 80.55s/it]
|
85 |
|
|
|
86 |
14%|ββ | 60/430 [1:21:19<8:16:44, 80.55s/it]
|
87 |
14%|ββ | 61/430 [1:22:40<8:15:19, 80.54s/it]
|
88 |
14%|ββ | 62/430 [1:24:00<8:13:53, 80.53s/it]
|
89 |
15%|ββ | 63/430 [1:25:21<8:12:48, 80.57s/it]
|
90 |
15%|ββ | 64/430 [1:26:41<8:11:20, 80.55s/it]
|
91 |
15%|ββ | 65/430 [1:28:02<8:09:50, 80.52s/it]
|
92 |
|
|
|
93 |
15%|ββ | 65/430 [1:28:02<8:09:50, 80.52s/it]
|
94 |
15%|ββ | 66/430 [1:29:22<8:08:31, 80.52s/it]
|
95 |
16%|ββ | 67/430 [1:30:43<8:07:03, 80.51s/it]
|
96 |
16%|ββ | 68/430 [1:32:03<8:05:39, 80.50s/it]
|
97 |
16%|ββ | 69/430 [1:33:24<8:04:17, 80.49s/it]
|
98 |
16%|ββ | 70/430 [1:34:44<8:02:50, 80.47s/it]
|
99 |
|
|
|
100 |
16%|ββ | 70/430 [1:34:44<8:02:50, 80.47s/it]
|
101 |
17%|ββ | 71/430 [1:36:05<8:02:07, 80.58s/it]
|
102 |
17%|ββ | 72/430 [1:37:25<8:00:27, 80.52s/it]
|
103 |
17%|ββ | 73/430 [1:38:46<7:58:58, 80.50s/it]
|
104 |
17%|ββ | 74/430 [1:40:06<7:57:28, 80.47s/it]
|
105 |
17%|ββ | 75/430 [1:41:27<7:56:03, 80.46s/it]
|
106 |
|
|
|
107 |
17%|ββ | 75/430 [1:41:27<7:56:03, 80.46s/it]
|
108 |
18%|ββ | 76/430 [1:42:47<7:54:59, 80.51s/it]
|
109 |
18%|ββ | 77/430 [1:44:08<7:53:37, 80.50s/it]
|
110 |
18%|ββ | 78/430 [1:45:28<7:52:21, 80.52s/it]
|
111 |
18%|ββ | 79/430 [1:46:49<7:50:59, 80.51s/it]
|
112 |
19%|ββ | 80/430 [1:48:09<7:49:36, 80.51s/it]
|
113 |
|
|
|
114 |
19%|ββ | 80/430 [1:48:09<7:49:36, 80.51s/it]
|
115 |
19%|ββ | 81/430 [1:49:30<7:48:23, 80.53s/it]
|
116 |
19%|ββ | 82/430 [1:50:50<7:47:05, 80.53s/it]
|
117 |
19%|ββ | 83/430 [1:52:11<7:45:34, 80.50s/it]
|
118 |
20%|ββ | 84/430 [1:53:32<7:45:20, 80.69s/it]
|
119 |
20%|ββ | 85/430 [1:54:52<7:43:34, 80.62s/it]
|
120 |
|
|
|
121 |
20%|ββ | 85/430 [1:54:52<7:43:34, 80.62s/it]
|
122 |
20%|ββ | 86/430 [1:56:13<7:42:01, 80.59s/it]
|
123 |
20%|ββ | 87/430 [1:58:23<9:05:42, 95.46s/it]
|
124 |
20%|ββ | 88/430 [1:59:44<8:38:35, 90.98s/it]
|
125 |
21%|ββ | 89/430 [2:01:04<8:19:08, 87.82s/it]
|
126 |
21%|ββ | 90/430 [2:02:25<8:05:18, 85.64s/it]
|
127 |
|
|
|
128 |
21%|ββ | 90/430 [2:02:25<8:05:18, 85.64s/it]
|
129 |
21%|ββ | 91/430 [2:03:45<7:55:07, 84.09s/it]
|
130 |
21%|βββ | 92/430 [2:05:06<7:47:38, 83.01s/it]
|
131 |
22%|βββ | 93/430 [2:06:26<7:42:05, 82.27s/it]
|
132 |
22%|βββ | 94/430 [2:07:47<7:37:48, 81.75s/it]
|
133 |
22%|βββ | 95/430 [2:09:07<7:34:41, 81.44s/it]
|
134 |
|
|
|
135 |
22%|βββ | 95/430 [2:09:07<7:34:41, 81.44s/it]
|
136 |
22%|βββ | 96/430 [2:10:28<7:31:43, 81.15s/it]
|
137 |
23%|βββ | 97/430 [2:11:48<7:29:16, 80.95s/it]
|
138 |
23%|βββ | 98/430 [2:13:09<7:27:04, 80.80s/it]
|
139 |
23%|βββ | 99/430 [2:14:29<7:25:22, 80.73s/it]
|
140 |
23%|βββ | 100/430 [2:15:50<7:23:40, 80.67s/it]
|
141 |
|
|
|
142 |
23%|βββ | 100/430 [2:15:50<7:23:40, 80.67s/it]
|
143 |
23%|βββ | 101/430 [2:17:10<7:22:04, 80.62s/it]
|
144 |
24%|βββ | 102/430 [2:18:31<7:20:28, 80.58s/it]
|
145 |
24%|βββ | 103/430 [2:19:51<7:18:57, 80.54s/it]
|
146 |
24%|βββ | 104/430 [2:21:12<7:17:41, 80.56s/it]
|
147 |
24%|βββ | 105/430 [2:22:32<7:16:17, 80.55s/it]
|
148 |
|
|
|
149 |
24%|βββ | 105/430 [2:22:32<7:16:17, 80.55s/it]
|
150 |
25%|βββ | 106/430 [2:23:53<7:15:01, 80.56s/it]
|
151 |
25%|βββ | 107/430 [2:25:14<7:13:51, 80.59s/it]
|
152 |
25%|βββ | 108/430 [2:26:34<7:12:27, 80.58s/it]
|
153 |
25%|βββ | 109/430 [2:27:55<7:11:11, 80.60s/it]
|
154 |
26%|βββ | 110/430 [2:29:15<7:09:47, 80.59s/it]
|
155 |
|
|
|
156 |
26%|βββ | 110/430 [2:29:15<7:09:47, 80.59s/it]
|
157 |
26%|βββ | 111/430 [2:30:36<7:08:21, 80.57s/it]
|
158 |
26%|βββ | 112/430 [2:31:57<7:07:20, 80.63s/it]
|
159 |
26%|βββ | 113/430 [2:33:17<7:05:49, 80.60s/it]
|
160 |
27%|βββ | 114/430 [2:34:38<7:04:22, 80.58s/it]
|
161 |
27%|βββ | 115/430 [2:35:58<7:02:59, 80.57s/it]
|
162 |
|
|
|
163 |
27%|βββ | 115/430 [2:35:58<7:02:59, 80.57s/it]
|
164 |
27%|βββ | 116/430 [2:37:20<7:02:34, 80.75s/it]
|
165 |
27%|βββ | 117/430 [2:38:40<7:00:55, 80.69s/it]
|
166 |
27%|βββ | 118/430 [2:40:01<6:59:23, 80.65s/it]
|
167 |
28%|βββ | 119/430 [2:41:21<6:57:54, 80.63s/it]
|
168 |
28%|βββ | 120/430 [2:42:42<6:56:26, 80.60s/it]
|
169 |
|
|
|
170 |
28%|βββ | 120/430 [2:42:42<6:56:26, 80.60s/it]
|
171 |
28%|βββ | 121/430 [2:44:02<6:55:00, 80.58s/it]
|
172 |
28%|βββ | 122/430 [2:45:23<6:53:30, 80.55s/it]
|
173 |
29%|βββ | 123/430 [2:46:43<6:52:17, 80.58s/it]
|
174 |
29%|βββ | 124/430 [2:48:04<6:51:04, 80.60s/it]
|
175 |
29%|βββ | 125/430 [2:49:25<6:49:38, 80.59s/it]
|
176 |
|
|
|
177 |
29%|βββ | 125/430 [2:49:25<6:49:38, 80.59s/it]
|
178 |
29%|βββ | 126/430 [2:50:45<6:48:24, 80.61s/it]
|
179 |
30%|βββ | 127/430 [2:52:06<6:47:00, 80.60s/it]
|
180 |
30%|βββ | 128/430 [2:53:27<6:46:32, 80.77s/it]
|
181 |
30%|βββ | 129/430 [2:54:48<6:44:56, 80.72s/it]
|
182 |
30%|βββ | 130/430 [2:56:57<7:56:43, 95.35s/it]
|
183 |
|
|
|
184 |
30%|βββ | 130/430 [2:56:57<7:56:43, 95.35s/it]
|
185 |
30%|βββ | 131/430 [2:58:18<7:32:58, 90.90s/it]
|
186 |
31%|βββ | 132/430 [2:59:38<7:15:53, 87.76s/it]
|
187 |
31%|βββ | 133/430 [3:00:59<7:03:46, 85.61s/it]
|
188 |
31%|βββ | 134/430 [3:02:19<6:54:52, 84.10s/it]
|
189 |
31%|ββββ | 135/430 [3:03:40<6:48:13, 83.03s/it]
|
190 |
|
|
|
191 |
31%|ββββ | 135/430 [3:03:40<6:48:13, 83.03s/it]
|
192 |
32%|ββββ | 136/430 [3:05:00<6:43:08, 82.27s/it]
|
193 |
32%|ββββ | 137/430 [3:06:21<6:39:15, 81.76s/it]
|
194 |
32%|ββββ | 138/430 [3:07:41<6:36:06, 81.39s/it]
|
195 |
32%|ββββ | 139/430 [3:09:02<6:33:50, 81.20s/it]
|
196 |
33%|ββββ | 140/430 [3:10:23<6:31:28, 81.00s/it]
|
197 |
|
|
|
198 |
33%|ββββ | 140/430 [3:10:23<6:31:28, 81.00s/it]
|
199 |
33%|ββββ | 141/430 [3:11:43<6:29:27, 80.86s/it]
|
200 |
33%|ββββ | 142/430 [3:13:04<6:27:32, 80.74s/it]
|
201 |
33%|ββββ | 143/430 [3:14:24<6:26:05, 80.72s/it]
|
202 |
33%|ββββ | 144/430 [3:15:45<6:24:25, 80.65s/it]
|
203 |
34%|ββββ | 145/430 [3:17:05<6:22:43, 80.57s/it]
|
204 |
|
|
|
205 |
34%|ββββ | 145/430 [3:17:05<6:22:43, 80.57s/it]
|
206 |
34%|ββββ | 146/430 [3:18:26<6:21:11, 80.53s/it]
|
207 |
34%|ββββ | 147/430 [3:19:46<6:19:49, 80.53s/it]
|
208 |
34%|ββββ | 148/430 [3:21:07<6:18:31, 80.54s/it]
|
209 |
35%|ββββ | 149/430 [3:22:27<6:17:16, 80.56s/it]
|
210 |
35%|ββββ | 150/430 [3:23:48<6:15:53, 80.55s/it]
|
211 |
|
|
|
212 |
35%|ββββ | 150/430 [3:23:48<6:15:53, 80.55s/it]Saving model checkpoint to ./results/checkpoint-150
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
213 |
35%|ββββ | 151/430 [3:25:10<6:16:59, 81.07s/it]
|
214 |
35%|ββββ | 152/430 [3:26:31<6:14:42, 80.87s/it]
|
215 |
36%|ββββ | 153/430 [3:27:51<6:12:50, 80.76s/it]
|
216 |
36%|ββββ | 154/430 [3:29:11<6:11:03, 80.67s/it]
|
217 |
36%|ββββ | 155/430 [3:30:32<6:09:25, 80.60s/it]
|
218 |
|
|
|
219 |
36%|ββββ | 155/430 [3:30:32<6:09:25, 80.60s/it]
|
220 |
36%|ββββ | 156/430 [3:31:52<6:07:55, 80.57s/it]
|
221 |
37%|ββββ | 157/430 [3:33:13<6:07:08, 80.69s/it]
|
222 |
37%|ββββ | 158/430 [3:34:34<6:05:33, 80.64s/it]
|
223 |
37%|ββββ | 159/430 [3:35:54<6:04:05, 80.61s/it]
|
224 |
37%|ββββ | 160/430 [3:37:15<6:02:37, 80.58s/it]
|
225 |
|
|
|
226 |
37%|ββββ | 160/430 [3:37:15<6:02:37, 80.58s/it]
|
227 |
37%|ββββ | 161/430 [3:38:36<6:01:14, 80.58s/it]
|
228 |
38%|ββββ | 162/430 [3:39:56<6:00:01, 80.60s/it]
|
229 |
38%|ββββ | 163/430 [3:41:17<5:59:32, 80.80s/it]
|
230 |
38%|ββββ | 164/430 [3:42:38<5:58:27, 80.86s/it]
|
231 |
38%|ββββ | 165/430 [3:43:59<5:56:38, 80.75s/it]
|
232 |
|
|
|
233 |
38%|ββββ | 165/430 [3:43:59<5:56:38, 80.75s/it]
|
234 |
39%|ββββ | 166/430 [3:45:20<5:55:05, 80.70s/it]
|
235 |
39%|ββββ | 167/430 [3:46:40<5:53:32, 80.66s/it]
|
236 |
39%|ββββ | 168/430 [3:48:01<5:52:04, 80.63s/it]
|
237 |
39%|ββββ | 169/430 [3:49:21<5:50:54, 80.67s/it]
|
238 |
40%|ββββ | 170/430 [3:50:42<5:49:25, 80.64s/it]
|
239 |
|
|
|
240 |
40%|ββββ | 170/430 [3:50:42<5:49:25, 80.64s/it]
|
241 |
40%|ββββ | 171/430 [3:52:03<5:48:04, 80.64s/it]
|
242 |
40%|ββββ | 172/430 [3:53:23<5:46:33, 80.59s/it]
|
243 |
40%|ββββ | 173/430 [3:55:33<6:48:29, 95.37s/it]
|
244 |
40%|ββββ | 174/430 [3:56:53<6:27:52, 90.91s/it]
|
245 |
41%|ββββ | 175/430 [3:58:14<6:13:17, 87.83s/it]
|
246 |
|
|
|
247 |
41%|ββββ | 175/430 [3:58:14<6:13:17, 87.83s/it]
|
248 |
41%|ββββ | 176/430 [3:59:35<6:02:43, 85.68s/it]
|
249 |
41%|ββββ | 177/430 [4:00:55<5:54:43, 84.12s/it]
|
250 |
41%|βββββ | 178/430 [4:02:16<5:48:45, 83.04s/it]
|
251 |
42%|βββββ | 179/430 [4:03:36<5:44:09, 82.27s/it]
|
252 |
42%|βββββ | 180/430 [4:04:57<5:40:41, 81.77s/it]
|
253 |
|
|
|
254 |
42%|βββββ | 180/430 [4:04:57<5:40:41, 81.77s/it]
|
255 |
42%|βββββ | 181/430 [4:06:17<5:37:50, 81.41s/it]
|
256 |
42%|βββββ | 182/430 [4:07:38<5:35:22, 81.14s/it]
|
257 |
43%|βββββ | 183/430 [4:08:58<5:33:16, 80.96s/it]
|
258 |
43%|βββββ | 184/430 [4:10:19<5:31:25, 80.83s/it]
|
259 |
43%|βββββ | 185/430 [4:11:40<5:29:40, 80.74s/it]
|
260 |
|
|
|
261 |
43%|βββββ | 185/430 [4:11:40<5:29:40, 80.74s/it]
|
262 |
43%|βββββ | 186/430 [4:13:00<5:28:03, 80.67s/it]
|
263 |
43%|βββββ | 187/430 [4:14:21<5:26:44, 80.68s/it]
|
264 |
44%|βββββ | 188/430 [4:15:41<5:25:24, 80.68s/it]
|
265 |
44%|βββββ | 189/430 [4:17:02<5:23:51, 80.63s/it]
|
266 |
44%|βββββ | 190/430 [4:18:22<5:22:20, 80.58s/it]
|
267 |
|
|
|
268 |
44%|βββββ | 190/430 [4:18:22<5:22:20, 80.58s/it]
|
269 |
44%|βββββ | 191/430 [4:19:43<5:20:53, 80.56s/it]
|
270 |
45%|βββββ | 192/430 [4:21:03<5:19:33, 80.56s/it]
|
271 |
45%|βββββ | 193/430 [4:22:24<5:18:45, 80.70s/it]
|
272 |
45%|βββββ | 194/430 [4:23:45<5:17:14, 80.65s/it]
|
273 |
45%|βββββ | 195/430 [4:25:06<5:15:47, 80.63s/it]
|
274 |
|
|
|
275 |
45%|βββββ | 195/430 [4:25:06<5:15:47, 80.63s/it]
|
276 |
46%|βββββ | 196/430 [4:26:26<5:14:14, 80.57s/it]
|
277 |
46%|βββββ | 197/430 [4:27:47<5:12:46, 80.54s/it]
|
278 |
46%|βββββ | 198/430 [4:29:07<5:11:27, 80.55s/it]
|
279 |
46%|βββββ | 199/430 [4:30:28<5:10:06, 80.55s/it]
|
280 |
47%|βββββ | 200/430 [4:31:48<5:08:45, 80.55s/it]
|
281 |
|
|
|
282 |
47%|βββββ | 200/430 [4:31:48<5:08:45, 80.55s/it]
|
283 |
47%|βββββ | 201/430 [4:33:09<5:07:25, 80.55s/it]
|
284 |
47%|βββββ | 202/430 [4:34:29<5:06:03, 80.54s/it]
|
285 |
47%|βββββ | 203/430 [4:35:50<5:04:47, 80.56s/it]
|
286 |
47%|βββββ | 204/430 [4:37:10<5:03:25, 80.55s/it]
|
287 |
48%|βββββ | 205/430 [4:38:31<5:02:05, 80.56s/it]
|
288 |
|
|
|
289 |
48%|βββββ | 205/430 [4:38:31<5:02:05, 80.56s/it]
|
290 |
48%|βββββ | 206/430 [4:39:52<5:00:48, 80.57s/it]
|
291 |
48%|βββββ | 207/430 [4:41:12<4:59:23, 80.55s/it]
|
292 |
48%|βββββ | 208/430 [4:42:33<4:58:04, 80.56s/it]
|
293 |
49%|βββββ | 209/430 [4:43:53<4:56:40, 80.54s/it]
|
294 |
49%|βββββ | 210/430 [4:45:14<4:55:18, 80.54s/it]
|
295 |
|
|
|
296 |
49%|βββββ | 210/430 [4:45:14<4:55:18, 80.54s/it]
|
297 |
49%|βββββ | 211/430 [4:46:35<4:54:27, 80.67s/it]
|
298 |
49%|βββββ | 212/430 [4:47:56<4:53:37, 80.81s/it]
|
299 |
50%|βββββ | 213/430 [4:49:16<4:52:03, 80.75s/it]
|
300 |
50%|βββββ | 214/430 [4:50:37<4:50:43, 80.75s/it]
|
301 |
50%|βββββ | 215/430 [4:51:58<4:49:04, 80.67s/it]
|
302 |
|
|
|
303 |
50%|βββββ | 215/430 [4:51:58<4:49:04, 80.67s/it]
|
304 |
50%|βββββ | 216/430 [4:54:08<5:40:27, 95.45s/it]
|
305 |
50%|βββββ | 217/430 [4:55:28<5:22:58, 90.98s/it]
|
306 |
51%|βββββ | 218/430 [4:56:49<5:10:22, 87.84s/it]
|
307 |
51%|βββββ | 219/430 [4:58:09<5:01:10, 85.64s/it]
|
308 |
51%|βββββ | 220/430 [4:59:30<4:54:31, 84.15s/it]
|
309 |
|
|
|
310 |
51%|βββββ | 220/430 [4:59:30<4:54:31, 84.15s/it]
|
311 |
51%|ββββββ | 221/430 [5:00:50<4:49:24, 83.09s/it]
|
312 |
52%|ββββββ | 222/430 [5:02:11<4:45:37, 82.39s/it]
|
313 |
52%|ββββββ | 223/430 [5:03:32<4:42:20, 81.84s/it]
|
314 |
52%|ββββββ | 224/430 [5:04:52<4:39:35, 81.44s/it]
|
315 |
52%|ββββββ | 225/430 [5:06:13<4:37:20, 81.17s/it]
|
316 |
|
|
|
317 |
52%|ββββββ | 225/430 [5:06:13<4:37:20, 81.17s/it]
|
318 |
53%|ββββββ | 226/430 [5:07:33<4:35:24, 81.00s/it]
|
319 |
53%|ββββββ | 227/430 [5:08:54<4:33:33, 80.85s/it]
|
320 |
53%|ββββββ | 228/430 [5:10:14<4:31:47, 80.73s/it]
|
321 |
53%|ββββββ | 229/430 [5:11:35<4:30:14, 80.67s/it]
|
322 |
53%|ββββββ | 230/430 [5:12:55<4:28:43, 80.62s/it]
|
323 |
|
|
|
324 |
53%|ββββββ | 230/430 [5:12:55<4:28:43, 80.62s/it]
|
325 |
54%|ββββββ | 231/430 [5:14:16<4:27:16, 80.58s/it]
|
326 |
54%|ββββββ | 232/430 [5:15:36<4:25:49, 80.55s/it]
|
327 |
54%|ββββββ | 233/430 [5:16:57<4:24:23, 80.53s/it]
|
328 |
54%|ββββββ | 234/430 [5:18:17<4:22:58, 80.50s/it]
|
329 |
55%|ββββββ | 235/430 [5:19:38<4:21:32, 80.48s/it]
|
330 |
|
|
|
331 |
55%|ββββββ | 235/430 [5:19:38<4:21:32, 80.48s/it]
|
332 |
55%|ββββββ | 236/430 [5:20:58<4:20:16, 80.50s/it]
|
333 |
55%|ββββββ | 237/430 [5:22:19<4:18:53, 80.49s/it]
|
334 |
55%|ββββββ | 238/430 [5:23:39<4:17:33, 80.49s/it]
|
335 |
56%|ββββββ | 239/430 [5:25:00<4:16:10, 80.47s/it]
|
336 |
56%|ββββββ | 240/430 [5:26:20<4:14:47, 80.46s/it]
|
337 |
|
|
|
338 |
56%|ββββββ | 240/430 [5:26:20<4:14:47, 80.46s/it]
|
339 |
56%|ββββββ | 241/430 [5:27:40<4:13:24, 80.45s/it]
|
340 |
56%|ββββββ | 242/430 [5:29:01<4:12:06, 80.46s/it]
|
341 |
57%|ββββββ | 243/430 [5:30:21<4:10:48, 80.47s/it]
|
342 |
57%|ββββββ | 244/430 [5:31:42<4:09:29, 80.48s/it]
|
343 |
57%|ββββββ | 245/430 [5:33:02<4:08:08, 80.48s/it]
|
344 |
|
|
|
345 |
57%|ββββββ | 245/430 [5:33:02<4:08:08, 80.48s/it]
|
346 |
57%|ββββββ | 246/430 [5:34:23<4:06:46, 80.47s/it]
|
347 |
57%|ββββββ | 247/430 [5:35:44<4:05:59, 80.65s/it]
|
348 |
58%|ββββββ | 248/430 [5:37:04<4:04:28, 80.59s/it]
|
349 |
58%|ββββββ | 249/430 [5:38:25<4:03:06, 80.59s/it]
|
350 |
58%|ββββββ | 250/430 [5:39:46<4:01:45, 80.59s/it]
|
351 |
|
|
|
352 |
58%|ββββββ | 250/430 [5:39:46<4:01:45, 80.59s/it]
|
353 |
58%|ββββββ | 251/430 [5:41:06<4:00:32, 80.63s/it]
|
354 |
59%|ββββββ | 252/430 [5:42:27<3:59:37, 80.77s/it]
|
355 |
59%|ββββββ | 253/430 [5:43:48<3:58:03, 80.70s/it]
|
356 |
59%|ββββββ | 254/430 [5:45:08<3:56:32, 80.64s/it]
|
357 |
59%|ββββββ | 255/430 [5:46:29<3:55:05, 80.60s/it]
|
358 |
|
|
|
359 |
59%|ββββββ | 255/430 [5:46:29<3:55:05, 80.60s/it]
|
360 |
60%|ββββββ | 256/430 [5:47:50<3:54:05, 80.72s/it]
|
361 |
60%|ββββββ | 257/430 [5:49:10<3:52:33, 80.65s/it]
|
362 |
60%|ββββββ | 258/430 [5:50:31<3:51:09, 80.64s/it]
|
363 |
60%|ββββββ | 259/430 [5:52:40<4:31:29, 95.26s/it]
|
364 |
60%|ββββββ | 260/430 [5:54:01<4:17:26, 90.86s/it]
|
365 |
|
|
|
366 |
60%|ββββββ | 260/430 [5:54:01<4:17:26, 90.86s/it]
|
367 |
61%|ββββββ | 261/430 [5:55:22<4:07:10, 87.75s/it]
|
368 |
61%|ββββββ | 262/430 [5:56:42<3:59:47, 85.64s/it]
|
369 |
61%|ββββββ | 263/430 [5:58:03<3:54:02, 84.09s/it]
|
370 |
61%|βββββββ | 264/430 [5:59:23<3:49:37, 83.00s/it]
|
371 |
62%|βββββββ | 265/430 [6:00:44<3:46:08, 82.23s/it]
|
372 |
|
|
|
373 |
62%|βββββββ | 265/430 [6:00:44<3:46:08, 82.23s/it]
|
374 |
62%|βββββββ | 266/430 [6:02:04<3:43:19, 81.70s/it]
|
375 |
62%|βββββββ | 267/430 [6:03:25<3:40:59, 81.34s/it]
|
376 |
62%|βββββββ | 268/430 [6:04:45<3:39:08, 81.16s/it]
|
377 |
63%|βββββββ | 269/430 [6:06:06<3:37:16, 80.97s/it]
|
378 |
63%|βββββββ | 270/430 [6:07:26<3:35:33, 80.84s/it]
|
379 |
|
|
|
380 |
63%|βββββββ | 270/430 [6:07:26<3:35:33, 80.84s/it]
|
381 |
63%|βββββββ | 271/430 [6:08:47<3:34:06, 80.80s/it]
|
382 |
63%|βββββββ | 272/430 [6:10:08<3:32:33, 80.72s/it]
|
383 |
63%|βββββββ | 273/430 [6:11:28<3:31:01, 80.65s/it]
|
384 |
64%|βββββββ | 274/430 [6:12:49<3:29:32, 80.60s/it]
|
385 |
64%|βββββββ | 275/430 [6:14:09<3:28:03, 80.54s/it]
|
386 |
|
|
|
387 |
64%|βββββββ | 275/430 [6:14:09<3:28:03, 80.54s/it]
|
388 |
64%|βββββββ | 276/430 [6:15:29<3:26:41, 80.53s/it]
|
389 |
64%|βββββββ | 277/430 [6:16:50<3:25:18, 80.51s/it]
|
390 |
65%|βββββββ | 278/430 [6:18:10<3:23:55, 80.50s/it]
|
391 |
65%|βββββββ | 279/430 [6:19:31<3:22:38, 80.52s/it]
|
392 |
65%|βββββββ | 280/430 [6:20:52<3:21:30, 80.60s/it]
|
393 |
|
|
|
394 |
65%|βββββββ | 280/430 [6:20:52<3:21:30, 80.60s/it]
|
395 |
65%|βββββββ | 281/430 [6:22:12<3:20:06, 80.58s/it]
|
396 |
66%|βββββββ | 282/430 [6:23:33<3:18:45, 80.58s/it]
|
397 |
66%|βββββββ | 283/430 [6:24:54<3:17:25, 80.58s/it]
|
398 |
66%|βββββββ | 284/430 [6:26:14<3:16:05, 80.59s/it]
|
399 |
66%|βββββββ | 285/430 [6:27:35<3:14:40, 80.56s/it]
|
400 |
|
|
|
401 |
66%|βββββββ | 285/430 [6:27:35<3:14:40, 80.56s/it]
|
402 |
67%|βββββββ | 286/430 [6:28:55<3:13:19, 80.55s/it]
|
403 |
67%|βββββββ | 287/430 [6:30:16<3:11:55, 80.53s/it]
|
404 |
67%|βββββββ | 288/430 [6:31:37<3:10:52, 80.65s/it]
|
405 |
67%|βββββββ | 289/430 [6:32:57<3:09:26, 80.62s/it]
|
406 |
67%|βββββββ | 290/430 [6:34:18<3:07:59, 80.57s/it]
|
407 |
|
|
|
408 |
67%|βββββββ | 290/430 [6:34:18<3:07:59, 80.57s/it]
|
409 |
68%|βββββββ | 291/430 [6:35:39<3:06:56, 80.69s/it]
|
410 |
68%|βββββββ | 292/430 [6:36:59<3:05:26, 80.63s/it]
|
411 |
68%|βββββββ | 293/430 [6:38:20<3:04:03, 80.61s/it]
|
412 |
68%|βββββββ | 294/430 [6:39:40<3:02:37, 80.57s/it]
|
413 |
69%|βββββββ | 295/430 [6:41:01<3:01:16, 80.56s/it]
|
414 |
|
|
|
415 |
69%|βββββββ | 295/430 [6:41:01<3:01:16, 80.56s/it]
|
416 |
69%|βββββββ | 296/430 [6:42:21<2:59:53, 80.55s/it]
|
417 |
69%|βββββββ | 297/430 [6:43:42<2:58:55, 80.72s/it]
|
418 |
69%|βββββββ | 298/430 [6:45:03<2:57:26, 80.66s/it]
|
419 |
70%|βββββββ | 299/430 [6:46:23<2:55:57, 80.59s/it]
|
420 |
70%|βββββββ | 300/430 [6:47:44<2:54:31, 80.55s/it]
|
421 |
|
|
|
422 |
70%|βββββββ | 300/430 [6:47:44<2:54:31, 80.55s/it]Saving model checkpoint to ./results/checkpoint-300
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
423 |
70%|βββββββ | 301/430 [6:49:06<2:54:08, 80.99s/it]
|
424 |
70%|βββββββ | 302/430 [6:51:15<3:24:02, 95.64s/it]
|
425 |
70%|βββββββ | 303/430 [6:52:36<3:12:48, 91.09s/it]
|
426 |
71%|βββββββ | 304/430 [6:53:56<3:04:36, 87.91s/it]
|
427 |
71%|βββββββ | 305/430 [6:55:17<2:58:30, 85.68s/it]
|
428 |
|
|
|
429 |
71%|βββββββ | 305/430 [6:55:17<2:58:30, 85.68s/it]
|
430 |
71%|βββββββ | 306/430 [6:56:38<2:54:00, 84.19s/it]
|
431 |
71%|ββββββββ | 307/430 [6:57:59<2:50:44, 83.29s/it]
|
432 |
72%|ββββββββ | 308/430 [6:59:19<2:47:40, 82.46s/it]
|
433 |
72%|ββββββββ | 309/430 [7:00:40<2:45:10, 81.91s/it]
|
434 |
72%|ββββββββ | 310/430 [7:02:00<2:42:57, 81.48s/it]
|
435 |
|
|
|
436 |
72%|ββββββββ | 310/430 [7:02:00<2:42:57, 81.48s/it]
|
437 |
72%|ββββββββ | 311/430 [7:03:21<2:41:01, 81.18s/it]
|
438 |
73%|ββββββββ | 312/430 [7:04:41<2:39:14, 80.97s/it]
|
439 |
73%|ββββββββ | 313/430 [7:06:02<2:37:35, 80.82s/it]
|
440 |
73%|ββββββββ | 314/430 [7:07:22<2:36:02, 80.71s/it]
|
441 |
73%|ββββββββ | 315/430 [7:08:43<2:34:33, 80.64s/it]
|
442 |
|
|
|
443 |
73%|ββββββββ | 315/430 [7:08:43<2:34:33, 80.64s/it]
|
444 |
73%|ββββββββ | 316/430 [7:10:03<2:33:10, 80.62s/it]
|
445 |
74%|ββββββββ | 317/430 [7:11:24<2:31:46, 80.59s/it]
|
446 |
74%|ββββββββ | 318/430 [7:12:44<2:30:23, 80.56s/it]
|
447 |
74%|ββββββββ | 319/430 [7:14:05<2:29:01, 80.56s/it]
|
448 |
74%|ββββββββ | 320/430 [7:15:26<2:27:42, 80.57s/it]
|
449 |
|
|
|
450 |
74%|ββββββββ | 320/430 [7:15:26<2:27:42, 80.57s/it]
|
451 |
75%|ββββββββ | 321/430 [7:16:46<2:26:19, 80.54s/it]
|
452 |
75%|ββββββββ | 322/430 [7:18:07<2:24:58, 80.54s/it]
|
453 |
75%|ββββββββ | 323/430 [7:19:27<2:23:39, 80.55s/it]
|
454 |
75%|ββββββββ | 324/430 [7:20:48<2:22:15, 80.53s/it]
|
455 |
76%|ββββββββ | 325/430 [7:22:08<2:20:55, 80.53s/it]
|
456 |
|
|
|
457 |
76%|ββββββββ | 325/430 [7:22:08<2:20:55, 80.53s/it]
|
458 |
76%|ββββββββ | 326/430 [7:23:29<2:19:34, 80.52s/it]
|
459 |
76%|ββββββββ | 327/430 [7:24:49<2:18:11, 80.50s/it]
|
460 |
76%|ββββββββ | 328/430 [7:26:10<2:16:52, 80.52s/it]
|
461 |
77%|ββββββββ | 329/430 [7:27:30<2:15:32, 80.52s/it]
|
462 |
77%|ββββββββ | 330/430 [7:28:51<2:14:13, 80.53s/it]
|
463 |
|
|
|
464 |
77%|ββββββββ | 330/430 [7:28:51<2:14:13, 80.53s/it]
|
465 |
77%|ββββββββ | 331/430 [7:30:11<2:12:57, 80.58s/it]
|
466 |
77%|ββββββββ | 332/430 [7:31:32<2:11:44, 80.66s/it]
|
467 |
77%|ββββββββ | 333/430 [7:32:53<2:10:32, 80.75s/it]
|
468 |
78%|ββββββββ | 334/430 [7:34:14<2:09:04, 80.67s/it]
|
469 |
78%|ββββββββ | 335/430 [7:35:34<2:07:39, 80.62s/it]
|
470 |
|
|
|
471 |
78%|ββββββββ | 335/430 [7:35:34<2:07:39, 80.62s/it]
|
472 |
78%|ββββββββ | 336/430 [7:36:55<2:06:30, 80.75s/it]
|
473 |
78%|ββββββββ | 337/430 [7:38:16<2:05:08, 80.73s/it]
|
474 |
79%|ββββββββ | 338/430 [7:39:36<2:03:41, 80.66s/it]
|
475 |
79%|ββββββββ | 339/430 [7:40:57<2:02:15, 80.61s/it]
|
476 |
79%|ββββββββ | 340/430 [7:42:17<2:00:50, 80.56s/it]
|
477 |
|
|
|
478 |
79%|ββββββββ | 340/430 [7:42:17<2:00:50, 80.56s/it]
|
479 |
79%|ββββββββ | 341/430 [7:43:38<1:59:27, 80.53s/it]
|
480 |
80%|ββββββββ | 342/430 [7:44:58<1:58:06, 80.53s/it]
|
481 |
80%|ββββββββ | 343/430 [7:46:19<1:56:47, 80.55s/it]
|
482 |
80%|ββββββββ | 344/430 [7:47:40<1:55:30, 80.59s/it]
|
483 |
80%|ββββββββ | 345/430 [7:49:49<2:14:57, 95.26s/it]
|
484 |
|
|
|
485 |
80%|ββββββββ | 345/430 [7:49:49<2:14:57, 95.26s/it]
|
486 |
80%|ββββββββ | 346/430 [7:51:10<2:07:10, 90.84s/it]
|
487 |
81%|ββββββββ | 347/430 [7:52:30<2:01:21, 87.73s/it]
|
488 |
81%|ββββββββ | 348/430 [7:53:51<1:56:55, 85.56s/it]
|
489 |
81%|ββββββββ | 349/430 [7:55:11<1:53:26, 84.03s/it]
|
490 |
81%|βββββββββ | 350/430 [7:56:32<1:50:39, 83.00s/it]
|
491 |
|
|
|
492 |
81%|βββββββββ | 350/430 [7:56:32<1:50:39, 83.00s/it]
|
493 |
82%|βββββββββ | 351/430 [7:57:52<1:48:17, 82.24s/it]
|
494 |
82%|βββββββββ | 352/430 [7:59:13<1:46:14, 81.72s/it]
|
495 |
82%|βββββββββ | 353/430 [8:00:34<1:44:40, 81.56s/it]
|
496 |
82%|βββββββββ | 354/430 [8:01:54<1:42:53, 81.22s/it]
|
497 |
83%|βββββββββ | 355/430 [8:03:15<1:41:14, 80.99s/it]
|
498 |
|
|
|
499 |
83%|βββββββββ | 355/430 [8:03:15<1:41:14, 80.99s/it]
|
500 |
83%|βββββββββ | 356/430 [8:04:36<1:39:57, 81.05s/it]
|
501 |
83%|βββββββββ | 357/430 [8:05:56<1:38:25, 80.90s/it]
|
502 |
83%|βββββββββ | 358/430 [8:07:17<1:36:55, 80.77s/it]
|
503 |
83%|βββββββββ | 359/430 [8:08:38<1:35:30, 80.71s/it]
|
504 |
84%|βββββββββ | 360/430 [8:09:58<1:34:03, 80.63s/it]
|
505 |
|
|
|
506 |
84%|βββββββββ | 360/430 [8:09:58<1:34:03, 80.63s/it]
|
507 |
84%|βββββββββ | 361/430 [8:11:18<1:32:39, 80.57s/it]
|
508 |
84%|βββββββββ | 362/430 [8:12:39<1:31:17, 80.55s/it]
|
509 |
84%|βββββββββ | 363/430 [8:13:59<1:29:56, 80.54s/it]
|
510 |
85%|βββββββββ | 364/430 [8:15:20<1:28:34, 80.52s/it]
|
511 |
85%|βββββββββ | 365/430 [8:16:40<1:27:12, 80.50s/it]
|
512 |
|
|
|
513 |
85%|βββββββββ | 365/430 [8:16:40<1:27:12, 80.50s/it]
|
514 |
85%|βββββββββ | 366/430 [8:18:01<1:25:49, 80.47s/it]
|
515 |
85%|βββββββββ | 367/430 [8:19:21<1:24:27, 80.44s/it]
|
516 |
86%|βββββββββ | 368/430 [8:20:42<1:23:10, 80.49s/it]
|
517 |
86%|βββββββββ | 369/430 [8:22:02<1:21:48, 80.47s/it]
|
518 |
86%|βββββββββ | 370/430 [8:23:23<1:20:29, 80.49s/it]
|
519 |
|
|
|
520 |
86%|βββββββββ | 370/430 [8:23:23<1:20:29, 80.49s/it]
|
521 |
86%|βββββββββ | 371/430 [8:24:43<1:19:09, 80.51s/it]
|
522 |
87%|βββββββββ | 372/430 [8:26:04<1:17:55, 80.61s/it]
|
523 |
87%|βββββββββ | 373/430 [8:27:25<1:16:32, 80.57s/it]
|
524 |
87%|βββββββββ | 374/430 [8:28:45<1:15:10, 80.55s/it]
|
525 |
87%|βββββββββ | 375/430 [8:30:06<1:13:49, 80.54s/it]
|
526 |
|
|
|
527 |
87%|βββββββββ | 375/430 [8:30:06<1:13:49, 80.54s/it]
|
528 |
87%|βββββββββ | 376/430 [8:31:26<1:12:29, 80.54s/it]
|
529 |
88%|βββββββββ | 377/430 [8:32:47<1:11:07, 80.53s/it]
|
530 |
88%|βββββββββ | 378/430 [8:34:07<1:09:47, 80.53s/it]
|
531 |
88%|βββββββββ | 379/430 [8:35:28<1:08:26, 80.52s/it]
|
532 |
88%|βββββββββ | 380/430 [8:36:48<1:07:07, 80.55s/it]
|
533 |
|
|
|
534 |
88%|βββββββββ | 380/430 [8:36:48<1:07:07, 80.55s/it]
|
535 |
89%|βββββββββ | 381/430 [8:38:09<1:05:47, 80.56s/it]
|
536 |
89%|βββββββββ | 382/430 [8:39:29<1:04:25, 80.53s/it]
|
537 |
89%|βββββββββ | 383/430 [8:40:50<1:03:04, 80.53s/it]
|
538 |
89%|βββββββββ | 384/430 [8:42:10<1:01:43, 80.51s/it]
|
539 |
90%|βββββββββ | 385/430 [8:43:31<1:00:22, 80.49s/it]
|
540 |
|
|
|
541 |
90%|βββββββββ | 385/430 [8:43:31<1:00:22, 80.49s/it]
|
542 |
90%|βββββββββ | 386/430 [8:44:52<59:06, 80.61s/it]
|
543 |
90%|βββββββββ | 387/430 [8:46:12<57:43, 80.55s/it]
|
544 |
90%|βββββββββ | 388/430 [8:48:22<1:06:42, 95.31s/it]
|
545 |
90%|βββββββββ | 389/430 [8:49:42<1:02:05, 90.85s/it]
|
546 |
91%|βββββββββ | 390/430 [8:51:03<58:29, 87.73s/it]
|
547 |
|
|
|
548 |
91%|βββββββββ | 390/430 [8:51:03<58:29, 87.73s/it]
|
549 |
91%|βββββββββ | 391/430 [8:52:23<55:37, 85.58s/it]
|
550 |
91%|βββββββββ | 392/430 [8:53:44<53:14, 84.05s/it]
|
551 |
91%|ββββββββββ| 393/430 [8:55:04<51:10, 82.98s/it]
|
552 |
92%|ββββββββββ| 394/430 [8:56:25<49:25, 82.39s/it]
|
553 |
92%|ββββββββββ| 395/430 [8:57:46<47:43, 81.82s/it]
|
554 |
|
|
|
555 |
92%|ββββββββββ| 395/430 [8:57:46<47:43, 81.82s/it]
|
556 |
92%|ββββββββββ| 396/430 [8:59:06<46:09, 81.45s/it]
|
557 |
92%|ββββββββββ| 397/430 [9:00:27<44:37, 81.15s/it]
|
558 |
93%|ββββββββββ| 398/430 [9:01:47<43:09, 80.93s/it]
|
559 |
93%|ββββββββββ| 399/430 [9:03:08<41:44, 80.80s/it]
|
560 |
93%|ββββββββββ| 400/430 [9:04:28<40:20, 80.70s/it]
|
561 |
|
|
|
562 |
93%|ββββββββββ| 400/430 [9:04:28<40:20, 80.70s/it]
|
563 |
93%|ββββββββββ| 401/430 [9:05:49<38:58, 80.65s/it]
|
564 |
93%|ββββββββββ| 402/430 [9:07:09<37:38, 80.65s/it]
|
565 |
94%|ββββββββββ| 403/430 [9:08:30<36:16, 80.59s/it]
|
566 |
94%|ββββββββββ| 404/430 [9:09:50<34:54, 80.56s/it]
|
567 |
94%|ββββββββββ| 405/430 [9:11:11<33:33, 80.54s/it]
|
568 |
|
|
|
569 |
94%|ββββββββββ| 405/430 [9:11:11<33:33, 80.54s/it]
|
570 |
94%|ββββββββββ| 406/430 [9:12:32<32:14, 80.61s/it]
|
571 |
95%|ββββββββββ| 407/430 [9:13:52<30:52, 80.56s/it]
|
572 |
95%|ββββββββββ| 408/430 [9:15:12<29:31, 80.54s/it]
|
573 |
95%|ββββββββββ| 409/430 [9:16:33<28:11, 80.54s/it]
|
574 |
95%|ββββββββββ| 410/430 [9:17:53<26:50, 80.51s/it]
|
575 |
|
|
|
576 |
95%|ββββββββββ| 410/430 [9:17:53<26:50, 80.51s/it]
|
577 |
96%|ββββββββββ| 411/430 [9:19:15<25:33, 80.70s/it]
|
578 |
96%|ββββββββββ| 412/430 [9:20:35<24:11, 80.65s/it]
|
579 |
96%|ββββββββββ| 413/430 [9:21:56<22:50, 80.61s/it]
|
580 |
96%|ββββββββββ| 414/430 [9:23:16<21:28, 80.56s/it]
|
581 |
97%|ββββββββββ| 415/430 [9:24:37<20:09, 80.62s/it]
|
582 |
|
|
|
583 |
97%|ββββββββββ| 415/430 [9:24:37<20:09, 80.62s/it]
|
584 |
97%|ββββββββββ| 416/430 [9:25:57<18:48, 80.58s/it]
|
585 |
97%|ββββββββββ| 417/430 [9:27:18<17:27, 80.54s/it]
|
586 |
97%|ββββββββββ| 418/430 [9:28:38<16:06, 80.54s/it]
|
587 |
97%|ββββββββββ| 419/430 [9:29:59<14:45, 80.51s/it]
|
588 |
98%|ββββββββββ| 420/430 [9:31:20<13:26, 80.65s/it]
|
589 |
|
|
|
590 |
98%|ββββββββββ| 420/430 [9:31:20<13:26, 80.65s/it]
|
591 |
98%|ββββββββββ| 421/430 [9:32:40<12:05, 80.59s/it]
|
592 |
98%|ββββββββββ| 422/430 [9:34:01<10:44, 80.56s/it]
|
593 |
98%|ββββββββββ| 423/430 [9:35:21<09:23, 80.53s/it]
|
594 |
99%|ββββββββββ| 424/430 [9:36:42<08:03, 80.54s/it]
|
595 |
99%|ββββββββββ| 425/430 [9:38:02<06:42, 80.52s/it]
|
596 |
|
|
|
597 |
99%|ββββββββββ| 425/430 [9:38:02<06:42, 80.52s/it]
|
598 |
99%|ββββββββββ| 426/430 [9:39:23<05:22, 80.52s/it]
|
599 |
99%|ββββββββββ| 427/430 [9:40:43<04:01, 80.52s/it]
|
600 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
601 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
nohup: ignoring input
|
2 |
+
[2023-02-20 17:05:49,355] [WARNING] [runner.py:186:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
|
3 |
+
[2023-02-20 17:05:49,405] [INFO] [runner.py:548:main] cmd = /opt/conda/bin/python3 -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNl19 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None tune_gpt.py --deepspeed deepspeed.json --upload-experiment
|
4 |
+
/opt/conda/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
|
5 |
+
from pandas import MultiIndex, Int64Index
|
6 |
+
[2023-02-20 17:05:51,897] [INFO] [launch.py:135:main] 0 NCCL_VERSION=2.11.4
|
7 |
+
[2023-02-20 17:05:51,897] [INFO] [launch.py:142:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6]}
|
8 |
+
[2023-02-20 17:05:51,897] [INFO] [launch.py:148:main] nnodes=1, num_local_procs=7, node_rank=0
|
9 |
+
[2023-02-20 17:05:51,897] [INFO] [launch.py:161:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1, 2, 3, 4, 5, 6]})
|
10 |
+
[2023-02-20 17:05:51,897] [INFO] [launch.py:162:main] dist_world_size=7
|
11 |
+
[2023-02-20 17:05:51,897] [INFO] [launch.py:164:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6
|
12 |
+
/opt/conda/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
|
13 |
+
from pandas import MultiIndex, Int64Index
|
14 |
+
/opt/conda/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
|
15 |
+
from pandas import MultiIndex, Int64Index
|
16 |
+
/opt/conda/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
|
17 |
+
from pandas import MultiIndex, Int64Index
|
18 |
+
/opt/conda/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
|
19 |
+
from pandas import MultiIndex, Int64Index
|
20 |
+
/opt/conda/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
|
21 |
+
from pandas import MultiIndex, Int64Index
|
22 |
+
/opt/conda/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
|
23 |
+
from pandas import MultiIndex, Int64Index
|
24 |
+
/opt/conda/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
|
25 |
+
from pandas import MultiIndex, Int64Index
|
26 |
+
No config specified, defaulting to: apps/all
|
27 |
+
No config specified, defaulting to: apps/all
|
28 |
+
Found cached dataset apps (/home/.cache/huggingface/datasets/codeparrot___apps/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5)
|
29 |
+
Found cached dataset apps (/home/.cache/huggingface/datasets/codeparrot___apps/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5)
|
30 |
+
No config specified, defaulting to: apps/all
|
31 |
+
Found cached dataset apps (/home/.cache/huggingface/datasets/codeparrot___apps/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5)
|
32 |
+
No config specified, defaulting to: apps/all
|
33 |
+
Found cached dataset apps (/home/.cache/huggingface/datasets/codeparrot___apps/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5)
|
34 |
+
No config specified, defaulting to: apps/all
|
35 |
+
Found cached dataset apps (/home/.cache/huggingface/datasets/codeparrot___apps/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5)
|
36 |
+
No config specified, defaulting to: apps/all
|
37 |
+
No config specified, defaulting to: apps/all
|
38 |
+
Found cached dataset apps (/home/.cache/huggingface/datasets/codeparrot___apps/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5)
|
39 |
+
Found cached dataset apps (/home/.cache/huggingface/datasets/codeparrot___apps/all/0.0.0/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5)
|
40 |
+
Max length: 2048
|
41 |
+
PyTorch: setting up devices
|
42 |
+
Max length: 2048
|
43 |
+
PyTorch: setting up devices
|
44 |
+
Max length: 2048
|
45 |
+
PyTorch: setting up devices
|
46 |
+
Max length: 2048
|
47 |
+
PyTorch: setting up devices
|
48 |
+
[2023-02-20 17:06:11,414] [INFO] [comm.py:657:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
|
49 |
+
Max length: 2048
|
50 |
+
PyTorch: setting up devices
|
51 |
+
Max length: 2048
|
52 |
+
PyTorch: setting up devices
|
53 |
+
Max length: 2048
|
54 |
+
PyTorch: setting up devices
|
55 |
+
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
|
56 |
+
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
|
57 |
+
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
|
58 |
+
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
|
59 |
+
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
|
60 |
+
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
|
61 |
+
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
|
62 |
+
GPU memory occupied: 6883 MB.
|
63 |
+
GPU memory occupied: 6883 MB.
|
64 |
+
GPU memory occupied: 6883 MB.
|
65 |
+
GPU memory occupied: 6883 MB.
|
66 |
+
GPU memory occupied: 6883 MB.
|
67 |
+
GPU memory occupied: 6883 MB.
|
68 |
+
GPU memory occupied: 6883 MB.
|
69 |
+
[2023-02-20 17:06:12,424] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed info: version=0.8.1, git-hash=unknown, git-branch=unknown
|
70 |
+
[2023-02-20 17:06:14,006] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
|
71 |
+
Installed CUDA version 11.6 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
|
72 |
+
Installed CUDA version 11.6 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
|
73 |
+
Installed CUDA version 11.6 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
|
74 |
+
Installed CUDA version 11.6 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
|
75 |
+
Installed CUDA version 11.6 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
|
76 |
+
Installed CUDA version 11.6 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
|
77 |
+
Installed CUDA version 11.6 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
|
78 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
79 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
80 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
81 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
82 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
83 |
+
Detected CUDA files, patching ldflags
|
84 |
+
Emitting ninja build file /home/.cache/torch_extensions/py38_cu117/cpu_adam/build.ninja...
|
85 |
+
Building extension module cpu_adam...
|
86 |
+
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
|
87 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
88 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
89 |
+
ninja: no work to do.
|
90 |
+
Loading extension module cpu_adam...
|
91 |
+
Time to load cpu_adam op: 2.825831890106201 seconds
|
92 |
+
Loading extension module cpu_adam...
|
93 |
+
Time to load cpu_adam op: 2.6894984245300293 seconds
|
94 |
+
Loading extension module cpu_adam...
|
95 |
+
Time to load cpu_adam op: 2.815955877304077 seconds
|
96 |
+
Loading extension module cpu_adam...
|
97 |
+
Time to load cpu_adam op: 2.816244125366211 seconds
|
98 |
+
Loading extension module cpu_adam...
|
99 |
+
Time to load cpu_adam op: 2.7123100757598877 seconds
|
100 |
+
Loading extension module cpu_adam...
|
101 |
+
Time to load cpu_adam op: 2.8215184211730957 seconds
|
102 |
+
Loading extension module cpu_adam...
|
103 |
+
Time to load cpu_adam op: 2.789081573486328 seconds
|
104 |
+
Adam Optimizer #0 is created with AVX2 arithmetic capability.
|
105 |
+
Config: alpha=0.000050, betas=(0.900000, 0.999000), weight_decay=0.050000, adam_w=1
|
106 |
+
[2023-02-20 17:06:19,789] [INFO] [logging.py:75:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer
|
107 |
+
[2023-02-20 17:06:19,794] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed Basic Optimizer = DeepSpeedCPUAdam
|
108 |
+
[2023-02-20 17:06:19,795] [INFO] [utils.py:53:is_zero_supported_optimizer] Checking ZeRO support for optimizer=DeepSpeedCPUAdam type=<class 'deepspeed.ops.adam.cpu_adam.DeepSpeedCPUAdam'>
|
109 |
+
[2023-02-20 17:06:19,795] [INFO] [logging.py:75:log_dist] [Rank 0] Creating torch.float32 ZeRO stage 2 optimizer
|
110 |
+
[2023-02-20 17:06:19,795] [INFO] [stage_1_and_2.py:144:__init__] Reduce bucket size 500000000
|
111 |
+
[2023-02-20 17:06:19,795] [INFO] [stage_1_and_2.py:145:__init__] Allgather bucket size 500000000
|
112 |
+
[2023-02-20 17:06:19,795] [INFO] [stage_1_and_2.py:146:__init__] CPU Offload: True
|
113 |
+
[2023-02-20 17:06:19,795] [INFO] [stage_1_and_2.py:147:__init__] Round robin gradient partitioning: False
|
114 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
115 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
116 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
117 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
118 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
119 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
120 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
121 |
+
Emitting ninja build file /home/.cache/torch_extensions/py38_cu117/utils/build.ninja...
|
122 |
+
Building extension module utils...
|
123 |
+
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
|
124 |
+
ninja: no work to do.
|
125 |
+
Loading extension module utils...
|
126 |
+
Time to load utils op: 0.34289026260375977 seconds
|
127 |
+
Loading extension module utils...
|
128 |
+
Time to load utils op: 0.2027883529663086 seconds
|
129 |
+
Loading extension module utils...
|
130 |
+
Loading extension module utils...
|
131 |
+
Time to load utils op: 0.2021796703338623 seconds
|
132 |
+
Loading extension module utils...
|
133 |
+
Time to load utils op: 0.2025763988494873 seconds
|
134 |
+
Loading extension module utils...
|
135 |
+
Time to load utils op: 0.2033846378326416 seconds
|
136 |
+
Time to load utils op: 0.2029557228088379 seconds
|
137 |
+
Loading extension module utils...
|
138 |
+
Time to load utils op: 0.30292582511901855 seconds
|
139 |
+
Rank: 6 partition count [7] and sizes[(17885514, False)]
|
140 |
+
Rank: 5 partition count [7] and sizes[(17885514, False)]
|
141 |
+
Rank: 4 partition count [7] and sizes[(17885514, False)]
|
142 |
+
Rank: 2 partition count [7] and sizes[(17885514, False)]
|
143 |
+
Rank: 3 partition count [7] and sizes[(17885514, False)]
|
144 |
+
Rank: 1 partition count [7] and sizes[(17885514, False)]
|
145 |
+
Rank: 0 partition count [7] and sizes[(17885514, False)]
|
146 |
+
[2023-02-20 17:06:27,470] [INFO] [utils.py:825:see_memory_usage] Before initializing optimizer states
|
147 |
+
[2023-02-20 17:06:27,471] [INFO] [utils.py:826:see_memory_usage] MA 0.66 GB Max_MA 0.66 GB CA 0.85 GB Max_CA 1 GB
|
148 |
+
[2023-02-20 17:06:27,471] [INFO] [utils.py:834:see_memory_usage] CPU Virtual Memory: used = 39.85 GB, percent = 7.9%
|
149 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
150 |
+
No modifications detected for re-loaded extension module utils, skipping build step...
|
151 |
+
Loading extension module utils...
|
152 |
+
Time to load utils op: 0.00165557861328125 seconds
|
153 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
154 |
+
No modifications detected for re-loaded extension module utils, skipping build step...
|
155 |
+
Loading extension module utils...
|
156 |
+
Time to load utils op: 0.008014678955078125 seconds
|
157 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
158 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
159 |
+
No modifications detected for re-loaded extension module utils, skipping build step...
|
160 |
+
Loading extension module utils...
|
161 |
+
No modifications detected for re-loaded extension module utils, skipping build step...
|
162 |
+
Loading extension module utils...
|
163 |
+
Time to load utils op: 0.03653693199157715 seconds
|
164 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
165 |
+
No modifications detected for re-loaded extension module utils, skipping build step...
|
166 |
+
Loading extension module utils...
|
167 |
+
Time to load utils op: 0.008858203887939453 seconds
|
168 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
169 |
+
No modifications detected for re-loaded extension module utils, skipping build step...
|
170 |
+
Loading extension module utils...
|
171 |
+
Time to load utils op: 0.0007452964782714844 seconds
|
172 |
+
Time to load utils op: 0.046510934829711914 seconds
|
173 |
+
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
|
174 |
+
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
|
175 |
+
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
|
176 |
+
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
|
177 |
+
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
|
178 |
+
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
|
179 |
+
[2023-02-20 17:06:28,120] [INFO] [utils.py:825:see_memory_usage] After initializing optimizer states
|
180 |
+
[2023-02-20 17:06:28,121] [INFO] [utils.py:826:see_memory_usage] MA 0.66 GB Max_MA 0.66 GB CA 0.85 GB Max_CA 1 GB
|
181 |
+
[2023-02-20 17:06:28,121] [INFO] [utils.py:834:see_memory_usage] CPU Virtual Memory: used = 40.4 GB, percent = 8.0%
|
182 |
+
[2023-02-20 17:06:28,121] [INFO] [stage_1_and_2.py:527:__init__] optimizer state initialized
|
183 |
+
[2023-02-20 17:06:28,222] [INFO] [utils.py:825:see_memory_usage] After initializing ZeRO optimizer
|
184 |
+
[2023-02-20 17:06:28,222] [INFO] [utils.py:826:see_memory_usage] MA 0.66 GB Max_MA 0.66 GB CA 0.85 GB Max_CA 1 GB
|
185 |
+
[2023-02-20 17:06:28,223] [INFO] [utils.py:834:see_memory_usage] CPU Virtual Memory: used = 40.4 GB, percent = 8.0%
|
186 |
+
[2023-02-20 17:06:28,223] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed Final Optimizer = adamw
|
187 |
+
[2023-02-20 17:06:28,223] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed using configured LR scheduler = WarmupLR
|
188 |
+
[2023-02-20 17:06:28,223] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed LR Scheduler = <deepspeed.runtime.lr_schedules.WarmupLR object at 0x7efce1a33f40>
|
189 |
+
[2023-02-20 17:06:28,224] [INFO] [logging.py:75:log_dist] [Rank 0] step=0, skipped=0, lr=[5e-05], mom=[[0.9, 0.999]]
|
190 |
+
[2023-02-20 17:06:28,226] [INFO] [config.py:1009:print] DeepSpeedEngine configuration:
|
191 |
+
[2023-02-20 17:06:28,226] [INFO] [config.py:1013:print] activation_checkpointing_config {
|
192 |
+
"partition_activations": false,
|
193 |
+
"contiguous_memory_optimization": false,
|
194 |
+
"cpu_checkpointing": false,
|
195 |
+
"number_checkpoints": null,
|
196 |
+
"synchronize_checkpoint_boundary": false,
|
197 |
+
"profile": false
|
198 |
+
}
|
199 |
+
[2023-02-20 17:06:28,226] [INFO] [config.py:1013:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
|
200 |
+
[2023-02-20 17:06:28,226] [INFO] [config.py:1013:print] amp_enabled .................. False
|
201 |
+
[2023-02-20 17:06:28,226] [INFO] [config.py:1013:print] amp_params ................... False
|
202 |
+
[2023-02-20 17:06:28,226] [INFO] [config.py:1013:print] autotuning_config ............ {
|
203 |
+
"enabled": false,
|
204 |
+
"start_step": null,
|
205 |
+
"end_step": null,
|
206 |
+
"metric_path": null,
|
207 |
+
"arg_mappings": null,
|
208 |
+
"metric": "throughput",
|
209 |
+
"model_info": null,
|
210 |
+
"results_dir": "autotuning_results",
|
211 |
+
"exps_dir": "autotuning_exps",
|
212 |
+
"overwrite": true,
|
213 |
+
"fast": true,
|
214 |
+
"start_profile_step": 3,
|
215 |
+
"end_profile_step": 5,
|
216 |
+
"tuner_type": "gridsearch",
|
217 |
+
"tuner_early_stopping": 5,
|
218 |
+
"tuner_num_trials": 50,
|
219 |
+
"model_info_path": null,
|
220 |
+
"mp_size": 1,
|
221 |
+
"max_train_batch_size": null,
|
222 |
+
"min_train_batch_size": 1,
|
223 |
+
"max_train_micro_batch_size_per_gpu": 1.024000e+03,
|
224 |
+
"min_train_micro_batch_size_per_gpu": 1,
|
225 |
+
"num_tuning_micro_batch_sizes": 3
|
226 |
+
}
|
227 |
+
[2023-02-20 17:06:28,226] [INFO] [config.py:1013:print] bfloat16_enabled ............. False
|
228 |
+
[2023-02-20 17:06:28,226] [INFO] [config.py:1013:print] checkpoint_parallel_write_pipeline False
|
229 |
+
[2023-02-20 17:06:28,226] [INFO] [config.py:1013:print] checkpoint_tag_validation_enabled True
|
230 |
+
[2023-02-20 17:06:28,226] [INFO] [config.py:1013:print] checkpoint_tag_validation_fail False
|
231 |
+
[2023-02-20 17:06:28,226] [INFO] [config.py:1013:print] comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7efd0eb74250>
|
232 |
+
[2023-02-20 17:06:28,226] [INFO] [config.py:1013:print] communication_data_type ...... None
|
233 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
|
234 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] curriculum_enabled_legacy .... False
|
235 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] curriculum_params_legacy ..... False
|
236 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
|
237 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] data_efficiency_enabled ...... False
|
238 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] dataloader_drop_last ......... False
|
239 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] disable_allgather ............ False
|
240 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] dump_state ................... False
|
241 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] dynamic_loss_scale_args ...... None
|
242 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] eigenvalue_enabled ........... False
|
243 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] eigenvalue_gas_boundary_resolution 1
|
244 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] eigenvalue_layer_name ........ bert.encoder.layer
|
245 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] eigenvalue_layer_num ......... 0
|
246 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] eigenvalue_max_iter .......... 100
|
247 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] eigenvalue_stability ......... 1e-06
|
248 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] eigenvalue_tol ............... 0.01
|
249 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] eigenvalue_verbose ........... False
|
250 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] elasticity_enabled ........... False
|
251 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] flops_profiler_config ........ {
|
252 |
+
"enabled": false,
|
253 |
+
"profile_step": 1,
|
254 |
+
"module_depth": -1,
|
255 |
+
"top_modules": 1,
|
256 |
+
"detailed": true,
|
257 |
+
"output_file": null
|
258 |
+
}
|
259 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] fp16_auto_cast ............... None
|
260 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] fp16_enabled ................. False
|
261 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] fp16_master_weights_and_gradients False
|
262 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] global_rank .................. 0
|
263 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] grad_accum_dtype ............. None
|
264 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] gradient_accumulation_steps .. 64
|
265 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] gradient_clipping ............ 1.0
|
266 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] gradient_predivide_factor .... 1.0
|
267 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] initial_dynamic_scale ........ 65536
|
268 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] load_universal_checkpoint .... False
|
269 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] loss_scale ................... 0
|
270 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] memory_breakdown ............. False
|
271 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=True, output_path='logs/', job_name='train_neo') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=True
|
272 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] nebula_config ................ {
|
273 |
+
"enabled": false,
|
274 |
+
"persistent_storage_path": null,
|
275 |
+
"persistent_time_interval": 100,
|
276 |
+
"num_of_version_in_retention": 2,
|
277 |
+
"enable_nebula_load": true,
|
278 |
+
"load_path": null
|
279 |
+
}
|
280 |
+
[2023-02-20 17:06:28,227] [INFO] [config.py:1013:print] optimizer_legacy_fusion ...... False
|
281 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] optimizer_name ............... adamw
|
282 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] optimizer_params ............. {'lr': 5e-05, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0.05}
|
283 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
|
284 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] pld_enabled .................. False
|
285 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] pld_params ................... False
|
286 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] prescale_gradients ........... False
|
287 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] scheduler_name ............... WarmupLR
|
288 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] scheduler_params ............. {'warmup_min_lr': 0, 'warmup_max_lr': 5e-05, 'warmup_num_steps': 500}
|
289 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] sparse_attention ............. None
|
290 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] sparse_gradients_enabled ..... False
|
291 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] steps_per_print .............. 2000
|
292 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] train_batch_size ............. 2688
|
293 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] train_micro_batch_size_per_gpu 6
|
294 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] use_node_local_storage ....... False
|
295 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] wall_clock_breakdown ......... False
|
296 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] world_size ................... 7
|
297 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] zero_allow_untested_optimizer True
|
298 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='cpu', nvme_path=None, buffer_count=4, pin_memory=True, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False
|
299 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] zero_enabled ................. True
|
300 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:1013:print] zero_optimization_stage ...... 2
|
301 |
+
[2023-02-20 17:06:28,228] [INFO] [config.py:998:print_user_config] json = {
|
302 |
+
"optimizer": {
|
303 |
+
"type": "AdamW",
|
304 |
+
"params": {
|
305 |
+
"lr": 5e-05,
|
306 |
+
"betas": [0.9, 0.999],
|
307 |
+
"eps": 1e-08,
|
308 |
+
"weight_decay": 0.05
|
309 |
+
}
|
310 |
+
},
|
311 |
+
"scheduler": {
|
312 |
+
"type": "WarmupLR",
|
313 |
+
"params": {
|
314 |
+
"warmup_min_lr": 0,
|
315 |
+
"warmup_max_lr": 5e-05,
|
316 |
+
"warmup_num_steps": 500
|
317 |
+
}
|
318 |
+
},
|
319 |
+
"zero_optimization": {
|
320 |
+
"stage": 2,
|
321 |
+
"offload_optimizer": {
|
322 |
+
"device": "cpu",
|
323 |
+
"pin_memory": true
|
324 |
+
},
|
325 |
+
"allgather_partitions": true,
|
326 |
+
"allgather_bucket_size": 5.000000e+08,
|
327 |
+
"overlap_comm": true,
|
328 |
+
"reduce_scatter": true,
|
329 |
+
"reduce_bucket_size": 5.000000e+08,
|
330 |
+
"contiguous_gradients": true
|
331 |
+
},
|
332 |
+
"tensorboard": {
|
333 |
+
"enabled": true,
|
334 |
+
"output_path": "logs/",
|
335 |
+
"job_name": "train_neo"
|
336 |
+
},
|
337 |
+
"zero_allow_untested_optimizer": true,
|
338 |
+
"gradient_accumulation_steps": 64,
|
339 |
+
"gradient_clipping": 1.0,
|
340 |
+
"steps_per_print": 2.000000e+03,
|
341 |
+
"train_batch_size": 2.688000e+03,
|
342 |
+
"train_micro_batch_size_per_gpu": 6,
|
343 |
+
"wall_clock_breakdown": false
|
344 |
+
}
|
345 |
+
Using /home/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
|
346 |
+
No modifications detected for re-loaded extension module utils, skipping build step...
|
347 |
+
Loading extension module utils...
|
348 |
+
Time to load utils op: 0.00042748451232910156 seconds
|
349 |
+
***** Running training *****
|
350 |
+
Num examples = 117232
|
351 |
+
Num Epochs = 10
|
352 |
+
Instantaneous batch size per device = 6
|
353 |
+
Total train batch size (w. parallel, distributed & accumulation) = 2688
|
354 |
+
Gradient Accumulation steps = 64
|
355 |
+
Total optimization steps = 430
|
356 |
+
Number of trainable parameters = 125198592
|
357 |
+
|
358 |
0%| | 0/430 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
|
359 |
+
|
360 |
0%| | 1/430 [01:21<9:41:43, 81.36s/it]
|
361 |
|
362 |
+
|
363 |
0%| | 1/430 [01:21<9:41:43, 81.36s/it]
|
364 |
0%| | 2/430 [02:41<9:36:10, 80.77s/it]
|
365 |
1%| | 3/430 [04:02<9:33:33, 80.59s/it]
|
366 |
1%| | 4/430 [05:22<9:31:45, 80.53s/it]
|
367 |
1%| | 5/430 [06:42<9:30:07, 80.49s/it]
|
368 |
|
369 |
+
|
370 |
1%| | 5/430 [06:42<9:30:07, 80.49s/it]
|
371 |
1%|β | 6/430 [08:03<9:28:34, 80.46s/it]
|
372 |
2%|β | 7/430 [09:24<9:28:07, 80.58s/it]
|
373 |
2%|β | 8/430 [10:44<9:26:23, 80.53s/it]
|
374 |
2%|β | 9/430 [12:05<9:24:58, 80.52s/it]
|
375 |
2%|β | 10/430 [13:25<9:23:23, 80.49s/it]
|
376 |
|
377 |
+
|
378 |
2%|β | 10/430 [13:25<9:23:23, 80.49s/it]
|
379 |
3%|β | 11/430 [14:45<9:22:02, 80.48s/it]
|
380 |
3%|β | 12/430 [16:06<9:20:48, 80.50s/it]
|
381 |
3%|β | 13/430 [17:26<9:19:19, 80.48s/it]
|
382 |
3%|β | 14/430 [18:47<9:18:17, 80.52s/it]
|
383 |
3%|β | 15/430 [20:08<9:16:48, 80.50s/it]
|
384 |
|
385 |
+
|
386 |
3%|β | 15/430 [20:08<9:16:48, 80.50s/it]
|
387 |
4%|β | 16/430 [21:28<9:15:34, 80.52s/it]
|
388 |
4%|β | 17/430 [22:49<9:14:08, 80.51s/it]
|
389 |
4%|β | 18/430 [24:09<9:12:33, 80.47s/it]
|
390 |
4%|β | 19/430 [25:30<9:11:22, 80.49s/it]
|
391 |
5%|β | 20/430 [26:50<9:09:51, 80.47s/it]
|
392 |
|
393 |
+
|
394 |
5%|β | 20/430 [26:50<9:09:51, 80.47s/it]
|
395 |
5%|β | 21/430 [28:10<9:08:25, 80.45s/it]
|
396 |
5%|β | 22/430 [29:31<9:08:02, 80.59s/it]
|
397 |
5%|β | 23/430 [30:52<9:06:16, 80.53s/it]
|
398 |
6%|β | 24/430 [32:12<9:04:45, 80.51s/it]
|
399 |
6%|β | 25/430 [33:32<9:03:06, 80.46s/it]
|
400 |
|
401 |
+
|
402 |
6%|β | 25/430 [33:32<9:03:06, 80.46s/it]
|
403 |
6%|β | 26/430 [34:53<9:01:27, 80.42s/it]
|
404 |
6%|β | 27/430 [36:13<8:59:58, 80.39s/it]
|
405 |
7%|β | 28/430 [37:33<8:58:26, 80.36s/it]
|
406 |
7%|β | 29/430 [38:54<8:57:43, 80.46s/it]
|
407 |
7%|β | 30/430 [40:14<8:56:19, 80.45s/it]
|
408 |
|
409 |
+
|
410 |
7%|β | 30/430 [40:14<8:56:19, 80.45s/it]
|
411 |
7%|β | 31/430 [41:35<8:55:05, 80.46s/it]
|
412 |
7%|β | 32/430 [42:55<8:53:43, 80.46s/it]
|
413 |
8%|β | 33/430 [44:16<8:52:17, 80.45s/it]
|
414 |
8%|β | 34/430 [45:36<8:50:56, 80.45s/it]
|
415 |
8%|β | 35/430 [46:57<8:49:37, 80.45s/it]
|
416 |
|
417 |
+
|
418 |
8%|β | 35/430 [46:57<8:49:37, 80.45s/it]
|
419 |
8%|β | 36/430 [48:17<8:48:22, 80.46s/it]
|
420 |
9%|β | 37/430 [49:38<8:46:59, 80.46s/it]
|
421 |
9%|β | 38/430 [50:58<8:45:38, 80.46s/it]
|
422 |
9%|β | 39/430 [52:19<8:44:24, 80.47s/it]
|
423 |
9%|β | 40/430 [53:39<8:43:03, 80.47s/it]
|
424 |
|
425 |
+
|
426 |
9%|β | 40/430 [53:39<8:43:03, 80.47s/it]
|
427 |
10%|β | 41/430 [55:00<8:42:48, 80.64s/it]
|
428 |
10%|β | 42/430 [56:21<8:41:07, 80.59s/it]
|
429 |
10%|β | 43/430 [57:41<8:39:31, 80.55s/it]
|
430 |
10%|β | 44/430 [59:50<10:12:21, 95.19s/it]
|
431 |
10%|β | 45/430 [1:01:11<9:42:25, 90.77s/it]
|
432 |
|
433 |
+
|
434 |
10%|β | 45/430 [1:01:11<9:42:25, 90.77s/it]
|
435 |
11%|β | 46/430 [1:02:31<9:21:05, 87.67s/it]
|
436 |
11%|β | 47/430 [1:03:52<9:06:42, 85.65s/it]
|
437 |
11%|β | 48/430 [1:05:13<8:55:45, 84.15s/it]
|
438 |
11%|ββ | 49/430 [1:06:33<8:47:15, 83.03s/it]
|
439 |
12%|ββ | 50/430 [1:07:54<8:41:00, 82.27s/it]
|
440 |
|
441 |
+
|
442 |
12%|ββ | 50/430 [1:07:54<8:41:00, 82.27s/it]
|
443 |
12%|ββ | 51/430 [1:09:14<8:36:14, 81.73s/it]
|
444 |
12%|ββ | 52/430 [1:10:35<8:32:33, 81.36s/it]
|
445 |
12%|ββ | 53/430 [1:11:55<8:30:00, 81.17s/it]
|
446 |
13%|ββ | 54/430 [1:13:16<8:27:22, 80.96s/it]
|
447 |
13%|ββ | 55/430 [1:14:37<8:25:21, 80.86s/it]
|
448 |
|
449 |
+
|
450 |
13%|ββ | 55/430 [1:14:37<8:25:21, 80.86s/it]
|
451 |
13%|ββ | 56/430 [1:15:57<8:23:29, 80.77s/it]
|
452 |
13%|ββ | 57/430 [1:17:18<8:21:34, 80.68s/it]
|
453 |
13%|ββ | 58/430 [1:18:38<8:19:56, 80.63s/it]
|
454 |
14%|ββ | 59/430 [1:19:59<8:18:24, 80.60s/it]
|
455 |
14%|ββ | 60/430 [1:21:19<8:16:44, 80.55s/it]
|
456 |
|
457 |
+
|
458 |
14%|ββ | 60/430 [1:21:19<8:16:44, 80.55s/it]
|
459 |
14%|ββ | 61/430 [1:22:40<8:15:19, 80.54s/it]
|
460 |
14%|ββ | 62/430 [1:24:00<8:13:53, 80.53s/it]
|
461 |
15%|ββ | 63/430 [1:25:21<8:12:48, 80.57s/it]
|
462 |
15%|ββ | 64/430 [1:26:41<8:11:20, 80.55s/it]
|
463 |
15%|ββ | 65/430 [1:28:02<8:09:50, 80.52s/it]
|
464 |
|
465 |
+
|
466 |
15%|ββ | 65/430 [1:28:02<8:09:50, 80.52s/it]
|
467 |
15%|ββ | 66/430 [1:29:22<8:08:31, 80.52s/it]
|
468 |
16%|ββ | 67/430 [1:30:43<8:07:03, 80.51s/it]
|
469 |
16%|ββ | 68/430 [1:32:03<8:05:39, 80.50s/it]
|
470 |
16%|ββ | 69/430 [1:33:24<8:04:17, 80.49s/it]
|
471 |
16%|ββ | 70/430 [1:34:44<8:02:50, 80.47s/it]
|
472 |
|
473 |
+
|
474 |
16%|ββ | 70/430 [1:34:44<8:02:50, 80.47s/it]
|
475 |
17%|ββ | 71/430 [1:36:05<8:02:07, 80.58s/it]
|
476 |
17%|ββ | 72/430 [1:37:25<8:00:27, 80.52s/it]
|
477 |
17%|ββ | 73/430 [1:38:46<7:58:58, 80.50s/it]
|
478 |
17%|ββ | 74/430 [1:40:06<7:57:28, 80.47s/it]
|
479 |
17%|ββ | 75/430 [1:41:27<7:56:03, 80.46s/it]
|
480 |
|
481 |
+
|
482 |
17%|ββ | 75/430 [1:41:27<7:56:03, 80.46s/it]
|
483 |
18%|ββ | 76/430 [1:42:47<7:54:59, 80.51s/it]
|
484 |
18%|ββ | 77/430 [1:44:08<7:53:37, 80.50s/it]
|
485 |
18%|ββ | 78/430 [1:45:28<7:52:21, 80.52s/it]
|
486 |
18%|ββ | 79/430 [1:46:49<7:50:59, 80.51s/it]
|
487 |
19%|ββ | 80/430 [1:48:09<7:49:36, 80.51s/it]
|
488 |
|
489 |
+
|
490 |
19%|ββ | 80/430 [1:48:09<7:49:36, 80.51s/it]
|
491 |
19%|ββ | 81/430 [1:49:30<7:48:23, 80.53s/it]
|
492 |
19%|ββ | 82/430 [1:50:50<7:47:05, 80.53s/it]
|
493 |
19%|ββ | 83/430 [1:52:11<7:45:34, 80.50s/it]
|
494 |
20%|ββ | 84/430 [1:53:32<7:45:20, 80.69s/it]
|
495 |
20%|ββ | 85/430 [1:54:52<7:43:34, 80.62s/it]
|
496 |
|
497 |
+
|
498 |
20%|ββ | 85/430 [1:54:52<7:43:34, 80.62s/it]
|
499 |
20%|ββ | 86/430 [1:56:13<7:42:01, 80.59s/it]
|
500 |
20%|ββ | 87/430 [1:58:23<9:05:42, 95.46s/it]
|
501 |
20%|ββ | 88/430 [1:59:44<8:38:35, 90.98s/it]
|
502 |
21%|ββ | 89/430 [2:01:04<8:19:08, 87.82s/it]
|
503 |
21%|ββ | 90/430 [2:02:25<8:05:18, 85.64s/it]
|
504 |
|
505 |
+
|
506 |
21%|ββ | 90/430 [2:02:25<8:05:18, 85.64s/it]
|
507 |
21%|ββ | 91/430 [2:03:45<7:55:07, 84.09s/it]
|
508 |
21%|βββ | 92/430 [2:05:06<7:47:38, 83.01s/it]
|
509 |
22%|βββ | 93/430 [2:06:26<7:42:05, 82.27s/it]
|
510 |
22%|βββ | 94/430 [2:07:47<7:37:48, 81.75s/it]
|
511 |
22%|βββ | 95/430 [2:09:07<7:34:41, 81.44s/it]
|
512 |
|
513 |
+
|
514 |
22%|βββ | 95/430 [2:09:07<7:34:41, 81.44s/it]
|
515 |
22%|βββ | 96/430 [2:10:28<7:31:43, 81.15s/it]
|
516 |
23%|βββ | 97/430 [2:11:48<7:29:16, 80.95s/it]
|
517 |
23%|βββ | 98/430 [2:13:09<7:27:04, 80.80s/it]
|
518 |
23%|βββ | 99/430 [2:14:29<7:25:22, 80.73s/it]
|
519 |
23%|βββ | 100/430 [2:15:50<7:23:40, 80.67s/it]
|
520 |
|
521 |
+
|
522 |
23%|βββ | 100/430 [2:15:50<7:23:40, 80.67s/it]
|
523 |
23%|βββ | 101/430 [2:17:10<7:22:04, 80.62s/it]
|
524 |
24%|βββ | 102/430 [2:18:31<7:20:28, 80.58s/it]
|
525 |
24%|βββ | 103/430 [2:19:51<7:18:57, 80.54s/it]
|
526 |
24%|βββ | 104/430 [2:21:12<7:17:41, 80.56s/it]
|
527 |
24%|βββ | 105/430 [2:22:32<7:16:17, 80.55s/it]
|
528 |
|
529 |
+
|
530 |
24%|βββ | 105/430 [2:22:32<7:16:17, 80.55s/it]
|
531 |
25%|βββ | 106/430 [2:23:53<7:15:01, 80.56s/it]
|
532 |
25%|βββ | 107/430 [2:25:14<7:13:51, 80.59s/it]
|
533 |
25%|βββ | 108/430 [2:26:34<7:12:27, 80.58s/it]
|
534 |
25%|βββ | 109/430 [2:27:55<7:11:11, 80.60s/it]
|
535 |
26%|βββ | 110/430 [2:29:15<7:09:47, 80.59s/it]
|
536 |
|
537 |
+
|
538 |
26%|βββ | 110/430 [2:29:15<7:09:47, 80.59s/it]
|
539 |
26%|βββ | 111/430 [2:30:36<7:08:21, 80.57s/it]
|
540 |
26%|βββ | 112/430 [2:31:57<7:07:20, 80.63s/it]
|
541 |
26%|βββ | 113/430 [2:33:17<7:05:49, 80.60s/it]
|
542 |
27%|βββ | 114/430 [2:34:38<7:04:22, 80.58s/it]
|
543 |
27%|βββ | 115/430 [2:35:58<7:02:59, 80.57s/it]
|
544 |
|
545 |
+
|
546 |
27%|βββ | 115/430 [2:35:58<7:02:59, 80.57s/it]
|
547 |
27%|βββ | 116/430 [2:37:20<7:02:34, 80.75s/it]
|
548 |
27%|βββ | 117/430 [2:38:40<7:00:55, 80.69s/it]
|
549 |
27%|βββ | 118/430 [2:40:01<6:59:23, 80.65s/it]
|
550 |
28%|βββ | 119/430 [2:41:21<6:57:54, 80.63s/it]
|
551 |
28%|βββ | 120/430 [2:42:42<6:56:26, 80.60s/it]
|
552 |
|
553 |
+
|
554 |
28%|βββ | 120/430 [2:42:42<6:56:26, 80.60s/it]
|
555 |
28%|βββ | 121/430 [2:44:02<6:55:00, 80.58s/it]
|
556 |
28%|βββ | 122/430 [2:45:23<6:53:30, 80.55s/it]
|
557 |
29%|βββ | 123/430 [2:46:43<6:52:17, 80.58s/it]
|
558 |
29%|βββ | 124/430 [2:48:04<6:51:04, 80.60s/it]
|
559 |
29%|βββ | 125/430 [2:49:25<6:49:38, 80.59s/it]
|
560 |
|
561 |
+
|
562 |
29%|βββ | 125/430 [2:49:25<6:49:38, 80.59s/it]
|
563 |
29%|βββ | 126/430 [2:50:45<6:48:24, 80.61s/it]
|
564 |
30%|βββ | 127/430 [2:52:06<6:47:00, 80.60s/it]
|
565 |
30%|βββ | 128/430 [2:53:27<6:46:32, 80.77s/it]
|
566 |
30%|βββ | 129/430 [2:54:48<6:44:56, 80.72s/it]
|
567 |
30%|βββ | 130/430 [2:56:57<7:56:43, 95.35s/it]
|
568 |
|
569 |
+
|
570 |
30%|βββ | 130/430 [2:56:57<7:56:43, 95.35s/it]
|
571 |
30%|βββ | 131/430 [2:58:18<7:32:58, 90.90s/it]
|
572 |
31%|βββ | 132/430 [2:59:38<7:15:53, 87.76s/it]
|
573 |
31%|βββ | 133/430 [3:00:59<7:03:46, 85.61s/it]
|
574 |
31%|βββ | 134/430 [3:02:19<6:54:52, 84.10s/it]
|
575 |
31%|ββββ | 135/430 [3:03:40<6:48:13, 83.03s/it]
|
576 |
|
577 |
+
|
578 |
31%|ββββ | 135/430 [3:03:40<6:48:13, 83.03s/it]
|
579 |
32%|ββββ | 136/430 [3:05:00<6:43:08, 82.27s/it]
|
580 |
32%|ββββ | 137/430 [3:06:21<6:39:15, 81.76s/it]
|
581 |
32%|ββββ | 138/430 [3:07:41<6:36:06, 81.39s/it]
|
582 |
32%|ββββ | 139/430 [3:09:02<6:33:50, 81.20s/it]
|
583 |
33%|ββββ | 140/430 [3:10:23<6:31:28, 81.00s/it]
|
584 |
|
585 |
+
|
586 |
33%|ββββ | 140/430 [3:10:23<6:31:28, 81.00s/it]
|
587 |
33%|ββββ | 141/430 [3:11:43<6:29:27, 80.86s/it]
|
588 |
33%|ββββ | 142/430 [3:13:04<6:27:32, 80.74s/it]
|
589 |
33%|ββββ | 143/430 [3:14:24<6:26:05, 80.72s/it]
|
590 |
33%|ββββ | 144/430 [3:15:45<6:24:25, 80.65s/it]
|
591 |
34%|ββββ | 145/430 [3:17:05<6:22:43, 80.57s/it]
|
592 |
|
593 |
+
|
594 |
34%|ββββ | 145/430 [3:17:05<6:22:43, 80.57s/it]
|
595 |
34%|ββββ | 146/430 [3:18:26<6:21:11, 80.53s/it]
|
596 |
34%|ββββ | 147/430 [3:19:46<6:19:49, 80.53s/it]
|
597 |
34%|ββββ | 148/430 [3:21:07<6:18:31, 80.54s/it]
|
598 |
35%|ββββ | 149/430 [3:22:27<6:17:16, 80.56s/it]
|
599 |
35%|ββββ | 150/430 [3:23:48<6:15:53, 80.55s/it]
|
600 |
|
601 |
+
|
602 |
35%|ββββ | 150/430 [3:23:48<6:15:53, 80.55s/it]Saving model checkpoint to ./results/checkpoint-150
|
603 |
+
Configuration saved in ./results/checkpoint-150/config.json
|
604 |
+
Model weights saved in ./results/checkpoint-150/pytorch_model.bin
|
605 |
+
tokenizer config file saved in ./results/checkpoint-150/tokenizer_config.json
|
606 |
+
Special tokens file saved in ./results/checkpoint-150/special_tokens_map.json
|
607 |
+
[2023-02-20 20:30:17,356] [INFO] [logging.py:75:log_dist] [Rank 0] [Torch] Checkpoint global_step151 is begin to save!
|
608 |
+
/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py:1428: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
|
609 |
+
warnings.warn(
|
610 |
+
/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py:1428: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
|
611 |
+
warnings.warn(
|
612 |
+
/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py:1428: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
|
613 |
+
warnings.warn(
|
614 |
+
/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py:1428: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
|
615 |
+
warnings.warn(
|
616 |
+
/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py:1428: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
|
617 |
+
warnings.warn(
|
618 |
+
/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py:1428: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
|
619 |
+
warnings.warn(
|
620 |
+
/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py:1428: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.
|
621 |
+
warnings.warn(
|
622 |
+
[2023-02-20 20:30:17,359] [INFO] [logging.py:75:log_dist] [Rank 0] Saving model checkpoint: ./results/checkpoint-150/global_step151/mp_rank_00_model_states.pt
|
623 |
+
[2023-02-20 20:30:17,359] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving ./results/checkpoint-150/global_step151/mp_rank_00_model_states.pt...
|
624 |
+
[2023-02-20 20:30:18,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved ./results/checkpoint-150/global_step151/mp_rank_00_model_states.pt.
|
625 |
+
[2023-02-20 20:30:18,019] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving ./results/checkpoint-150/global_step151/zero_pp_rank_0_mp_rank_00_optim_states.pt...
|
626 |
+
[2023-02-20 20:30:18,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved ./results/checkpoint-150/global_step151/zero_pp_rank_0_mp_rank_00_optim_states.pt.
|
627 |
+
[2023-02-20 20:30:18,223] [INFO] [engine.py:3407:_save_zero_checkpoint] zero checkpoint saved ./results/checkpoint-150/global_step151/zero_pp_rank_0_mp_rank_00_optim_states.pt
|
628 |
+
[2023-02-20 20:30:18,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step151 is ready now!
|
629 |
+
Deleting older checkpoint [results/checkpoint-45] due to args.save_total_limit
|
630 |
+
|
631 |
35%|ββββ | 151/430 [3:25:10<6:16:59, 81.07s/it]
|
632 |
35%|ββββ | 152/430 [3:26:31<6:14:42, 80.87s/it]
|
633 |
36%|ββββ | 153/430 [3:27:51<6:12:50, 80.76s/it]
|
634 |
36%|ββββ | 154/430 [3:29:11<6:11:03, 80.67s/it]
|
635 |
36%|ββββ | 155/430 [3:30:32<6:09:25, 80.60s/it]
|
636 |
|
637 |
+
|
638 |
36%|ββββ | 155/430 [3:30:32<6:09:25, 80.60s/it]
|
639 |
36%|ββββ | 156/430 [3:31:52<6:07:55, 80.57s/it]
|
640 |
37%|ββββ | 157/430 [3:33:13<6:07:08, 80.69s/it]
|
641 |
37%|ββββ | 158/430 [3:34:34<6:05:33, 80.64s/it]
|
642 |
37%|ββββ | 159/430 [3:35:54<6:04:05, 80.61s/it]
|
643 |
37%|ββββ | 160/430 [3:37:15<6:02:37, 80.58s/it]
|
644 |
|
645 |
+
|
646 |
37%|ββββ | 160/430 [3:37:15<6:02:37, 80.58s/it]
|
647 |
37%|ββββ | 161/430 [3:38:36<6:01:14, 80.58s/it]
|
648 |
38%|ββββ | 162/430 [3:39:56<6:00:01, 80.60s/it]
|
649 |
38%|ββββ | 163/430 [3:41:17<5:59:32, 80.80s/it]
|
650 |
38%|ββββ | 164/430 [3:42:38<5:58:27, 80.86s/it]
|
651 |
38%|ββββ | 165/430 [3:43:59<5:56:38, 80.75s/it]
|
652 |
|
653 |
+
|
654 |
38%|ββββ | 165/430 [3:43:59<5:56:38, 80.75s/it]
|
655 |
39%|ββββ | 166/430 [3:45:20<5:55:05, 80.70s/it]
|
656 |
39%|ββββ | 167/430 [3:46:40<5:53:32, 80.66s/it]
|
657 |
39%|ββββ | 168/430 [3:48:01<5:52:04, 80.63s/it]
|
658 |
39%|ββββ | 169/430 [3:49:21<5:50:54, 80.67s/it]
|
659 |
40%|ββββ | 170/430 [3:50:42<5:49:25, 80.64s/it]
|
660 |
|
661 |
+
|
662 |
40%|ββββ | 170/430 [3:50:42<5:49:25, 80.64s/it]
|
663 |
40%|ββββ | 171/430 [3:52:03<5:48:04, 80.64s/it]
|
664 |
40%|ββββ | 172/430 [3:53:23<5:46:33, 80.59s/it]
|
665 |
40%|ββββ | 173/430 [3:55:33<6:48:29, 95.37s/it]
|
666 |
40%|ββββ | 174/430 [3:56:53<6:27:52, 90.91s/it]
|
667 |
41%|ββββ | 175/430 [3:58:14<6:13:17, 87.83s/it]
|
668 |
|
669 |
+
|
670 |
41%|ββββ | 175/430 [3:58:14<6:13:17, 87.83s/it]
|
671 |
41%|ββββ | 176/430 [3:59:35<6:02:43, 85.68s/it]
|
672 |
41%|ββββ | 177/430 [4:00:55<5:54:43, 84.12s/it]
|
673 |
41%|βββββ | 178/430 [4:02:16<5:48:45, 83.04s/it]
|
674 |
42%|βββββ | 179/430 [4:03:36<5:44:09, 82.27s/it]
|
675 |
42%|βββββ | 180/430 [4:04:57<5:40:41, 81.77s/it]
|
676 |
|
677 |
+
|
678 |
42%|βββββ | 180/430 [4:04:57<5:40:41, 81.77s/it]
|
679 |
42%|βββββ | 181/430 [4:06:17<5:37:50, 81.41s/it]
|
680 |
42%|βββββ | 182/430 [4:07:38<5:35:22, 81.14s/it]
|
681 |
43%|βββββ | 183/430 [4:08:58<5:33:16, 80.96s/it]
|
682 |
43%|βββββ | 184/430 [4:10:19<5:31:25, 80.83s/it]
|
683 |
43%|βββββ | 185/430 [4:11:40<5:29:40, 80.74s/it]
|
684 |
|
685 |
+
|
686 |
43%|βββββ | 185/430 [4:11:40<5:29:40, 80.74s/it]
|
687 |
43%|βββββ | 186/430 [4:13:00<5:28:03, 80.67s/it]
|
688 |
43%|βββββ | 187/430 [4:14:21<5:26:44, 80.68s/it]
|
689 |
44%|βββββ | 188/430 [4:15:41<5:25:24, 80.68s/it]
|
690 |
44%|βββββ | 189/430 [4:17:02<5:23:51, 80.63s/it]
|
691 |
44%|βββββ | 190/430 [4:18:22<5:22:20, 80.58s/it]
|
692 |
|
693 |
+
|
694 |
44%|βββββ | 190/430 [4:18:22<5:22:20, 80.58s/it]
|
695 |
44%|βββββ | 191/430 [4:19:43<5:20:53, 80.56s/it]
|
696 |
45%|βββββ | 192/430 [4:21:03<5:19:33, 80.56s/it]
|
697 |
45%|βββββ | 193/430 [4:22:24<5:18:45, 80.70s/it]
|
698 |
45%|βββββ | 194/430 [4:23:45<5:17:14, 80.65s/it]
|
699 |
45%|βββββ | 195/430 [4:25:06<5:15:47, 80.63s/it]
|
700 |
|
701 |
+
|
702 |
45%|βββββ | 195/430 [4:25:06<5:15:47, 80.63s/it]
|
703 |
46%|βββββ | 196/430 [4:26:26<5:14:14, 80.57s/it]
|
704 |
46%|βββββ | 197/430 [4:27:47<5:12:46, 80.54s/it]
|
705 |
46%|βββββ | 198/430 [4:29:07<5:11:27, 80.55s/it]
|
706 |
46%|βββββ | 199/430 [4:30:28<5:10:06, 80.55s/it]
|
707 |
47%|βββββ | 200/430 [4:31:48<5:08:45, 80.55s/it]
|
708 |
|
709 |
+
|
710 |
47%|βββββ | 200/430 [4:31:48<5:08:45, 80.55s/it]
|
711 |
47%|βββββ | 201/430 [4:33:09<5:07:25, 80.55s/it]
|
712 |
47%|βββββ | 202/430 [4:34:29<5:06:03, 80.54s/it]
|
713 |
47%|βββββ | 203/430 [4:35:50<5:04:47, 80.56s/it]
|
714 |
47%|βββββ | 204/430 [4:37:10<5:03:25, 80.55s/it]
|
715 |
48%|βββββ | 205/430 [4:38:31<5:02:05, 80.56s/it]
|
716 |
|
717 |
+
|
718 |
48%|βββββ | 205/430 [4:38:31<5:02:05, 80.56s/it]
|
719 |
48%|βββββ | 206/430 [4:39:52<5:00:48, 80.57s/it]
|
720 |
48%|βββββ | 207/430 [4:41:12<4:59:23, 80.55s/it]
|
721 |
48%|βββββ | 208/430 [4:42:33<4:58:04, 80.56s/it]
|
722 |
49%|βββββ | 209/430 [4:43:53<4:56:40, 80.54s/it]
|
723 |
49%|βββββ | 210/430 [4:45:14<4:55:18, 80.54s/it]
|
724 |
|
725 |
+
|
726 |
49%|βββββ | 210/430 [4:45:14<4:55:18, 80.54s/it]
|
727 |
49%|βββββ | 211/430 [4:46:35<4:54:27, 80.67s/it]
|
728 |
49%|βββββ | 212/430 [4:47:56<4:53:37, 80.81s/it]
|
729 |
50%|βββββ | 213/430 [4:49:16<4:52:03, 80.75s/it]
|
730 |
50%|βββββ | 214/430 [4:50:37<4:50:43, 80.75s/it]
|
731 |
50%|βββββ | 215/430 [4:51:58<4:49:04, 80.67s/it]
|
732 |
|
733 |
+
|
734 |
50%|βββββ | 215/430 [4:51:58<4:49:04, 80.67s/it]
|
735 |
50%|βββββ | 216/430 [4:54:08<5:40:27, 95.45s/it]
|
736 |
50%|βββββ | 217/430 [4:55:28<5:22:58, 90.98s/it]
|
737 |
51%|βββββ | 218/430 [4:56:49<5:10:22, 87.84s/it]
|
738 |
51%|βββββ | 219/430 [4:58:09<5:01:10, 85.64s/it]
|
739 |
51%|βββββ | 220/430 [4:59:30<4:54:31, 84.15s/it]
|
740 |
|
741 |
+
|
742 |
51%|βββββ | 220/430 [4:59:30<4:54:31, 84.15s/it]
|
743 |
51%|ββββββ | 221/430 [5:00:50<4:49:24, 83.09s/it]
|
744 |
52%|ββββββ | 222/430 [5:02:11<4:45:37, 82.39s/it]
|
745 |
52%|ββββββ | 223/430 [5:03:32<4:42:20, 81.84s/it]
|
746 |
52%|ββββββ | 224/430 [5:04:52<4:39:35, 81.44s/it]
|
747 |
52%|ββββββ | 225/430 [5:06:13<4:37:20, 81.17s/it]
|
748 |
|
749 |
+
|
750 |
52%|ββββββ | 225/430 [5:06:13<4:37:20, 81.17s/it]
|
751 |
53%|ββββββ | 226/430 [5:07:33<4:35:24, 81.00s/it]
|
752 |
53%|ββββββ | 227/430 [5:08:54<4:33:33, 80.85s/it]
|
753 |
53%|ββββββ | 228/430 [5:10:14<4:31:47, 80.73s/it]
|
754 |
53%|ββββββ | 229/430 [5:11:35<4:30:14, 80.67s/it]
|
755 |
53%|ββββββ | 230/430 [5:12:55<4:28:43, 80.62s/it]
|
756 |
|
757 |
+
|
758 |
53%|ββββββ | 230/430 [5:12:55<4:28:43, 80.62s/it]
|
759 |
54%|ββββββ | 231/430 [5:14:16<4:27:16, 80.58s/it]
|
760 |
54%|ββββββ | 232/430 [5:15:36<4:25:49, 80.55s/it]
|
761 |
54%|ββββββ | 233/430 [5:16:57<4:24:23, 80.53s/it]
|
762 |
54%|ββββββ | 234/430 [5:18:17<4:22:58, 80.50s/it]
|
763 |
55%|ββββββ | 235/430 [5:19:38<4:21:32, 80.48s/it]
|
764 |
|
765 |
+
|
766 |
55%|ββββββ | 235/430 [5:19:38<4:21:32, 80.48s/it]
|
767 |
55%|ββββββ | 236/430 [5:20:58<4:20:16, 80.50s/it]
|
768 |
55%|ββββββ | 237/430 [5:22:19<4:18:53, 80.49s/it]
|
769 |
55%|ββββββ | 238/430 [5:23:39<4:17:33, 80.49s/it]
|
770 |
56%|ββββββ | 239/430 [5:25:00<4:16:10, 80.47s/it]
|
771 |
56%|ββββββ | 240/430 [5:26:20<4:14:47, 80.46s/it]
|
772 |
|
773 |
+
|
774 |
56%|ββββββ | 240/430 [5:26:20<4:14:47, 80.46s/it]
|
775 |
56%|ββββββ | 241/430 [5:27:40<4:13:24, 80.45s/it]
|
776 |
56%|ββββββ | 242/430 [5:29:01<4:12:06, 80.46s/it]
|
777 |
57%|ββββββ | 243/430 [5:30:21<4:10:48, 80.47s/it]
|
778 |
57%|ββββββ | 244/430 [5:31:42<4:09:29, 80.48s/it]
|
779 |
57%|ββββββ | 245/430 [5:33:02<4:08:08, 80.48s/it]
|
780 |
|
781 |
+
|
782 |
57%|ββββββ | 245/430 [5:33:02<4:08:08, 80.48s/it]
|
783 |
57%|ββββββ | 246/430 [5:34:23<4:06:46, 80.47s/it]
|
784 |
57%|ββββββ | 247/430 [5:35:44<4:05:59, 80.65s/it]
|
785 |
58%|ββββββ | 248/430 [5:37:04<4:04:28, 80.59s/it]
|
786 |
58%|ββββββ | 249/430 [5:38:25<4:03:06, 80.59s/it]
|
787 |
58%|ββββββ | 250/430 [5:39:46<4:01:45, 80.59s/it]
|
788 |
|
789 |
+
|
790 |
58%|ββββββ | 250/430 [5:39:46<4:01:45, 80.59s/it]
|
791 |
58%|ββββββ | 251/430 [5:41:06<4:00:32, 80.63s/it]
|
792 |
59%|ββββββ | 252/430 [5:42:27<3:59:37, 80.77s/it]
|
793 |
59%|ββββββ | 253/430 [5:43:48<3:58:03, 80.70s/it]
|
794 |
59%|ββββββ | 254/430 [5:45:08<3:56:32, 80.64s/it]
|
795 |
59%|ββββββ | 255/430 [5:46:29<3:55:05, 80.60s/it]
|
796 |
|
797 |
+
|
798 |
59%|ββββββ | 255/430 [5:46:29<3:55:05, 80.60s/it]
|
799 |
60%|ββββββ | 256/430 [5:47:50<3:54:05, 80.72s/it]
|
800 |
60%|ββββββ | 257/430 [5:49:10<3:52:33, 80.65s/it]
|
801 |
60%|ββββββ | 258/430 [5:50:31<3:51:09, 80.64s/it]
|
802 |
60%|ββββββ | 259/430 [5:52:40<4:31:29, 95.26s/it]
|
803 |
60%|ββββββ | 260/430 [5:54:01<4:17:26, 90.86s/it]
|
804 |
|
805 |
+
|
806 |
60%|ββββββ | 260/430 [5:54:01<4:17:26, 90.86s/it]
|
807 |
61%|ββββββ | 261/430 [5:55:22<4:07:10, 87.75s/it]
|
808 |
61%|ββββββ | 262/430 [5:56:42<3:59:47, 85.64s/it]
|
809 |
61%|ββββββ | 263/430 [5:58:03<3:54:02, 84.09s/it]
|
810 |
61%|βββββββ | 264/430 [5:59:23<3:49:37, 83.00s/it]
|
811 |
62%|βββββββ | 265/430 [6:00:44<3:46:08, 82.23s/it]
|
812 |
|
813 |
+
|
814 |
62%|βββββββ | 265/430 [6:00:44<3:46:08, 82.23s/it]
|
815 |
62%|βββββββ | 266/430 [6:02:04<3:43:19, 81.70s/it]
|
816 |
62%|βββββββ | 267/430 [6:03:25<3:40:59, 81.34s/it]
|
817 |
62%|βββββββ | 268/430 [6:04:45<3:39:08, 81.16s/it]
|
818 |
63%|βββββββ | 269/430 [6:06:06<3:37:16, 80.97s/it]
|
819 |
63%|βββββββ | 270/430 [6:07:26<3:35:33, 80.84s/it]
|
820 |
|
821 |
+
|
822 |
63%|βββββββ | 270/430 [6:07:26<3:35:33, 80.84s/it]
|
823 |
63%|βββββββ | 271/430 [6:08:47<3:34:06, 80.80s/it]
|
824 |
63%|βββββββ | 272/430 [6:10:08<3:32:33, 80.72s/it]
|
825 |
63%|βββββββ | 273/430 [6:11:28<3:31:01, 80.65s/it]
|
826 |
64%|βββββββ | 274/430 [6:12:49<3:29:32, 80.60s/it]
|
827 |
64%|βββββββ | 275/430 [6:14:09<3:28:03, 80.54s/it]
|
828 |
|
829 |
+
|
830 |
64%|βββββββ | 275/430 [6:14:09<3:28:03, 80.54s/it]
|
831 |
64%|βββββββ | 276/430 [6:15:29<3:26:41, 80.53s/it]
|
832 |
64%|βββββββ | 277/430 [6:16:50<3:25:18, 80.51s/it]
|
833 |
65%|βββββββ | 278/430 [6:18:10<3:23:55, 80.50s/it]
|
834 |
65%|βββββββ | 279/430 [6:19:31<3:22:38, 80.52s/it]
|
835 |
65%|βββββββ | 280/430 [6:20:52<3:21:30, 80.60s/it]
|
836 |
|
837 |
+
|
838 |
65%|βββββββ | 280/430 [6:20:52<3:21:30, 80.60s/it]
|
839 |
65%|βββββββ | 281/430 [6:22:12<3:20:06, 80.58s/it]
|
840 |
66%|βββββββ | 282/430 [6:23:33<3:18:45, 80.58s/it]
|
841 |
66%|βββββββ | 283/430 [6:24:54<3:17:25, 80.58s/it]
|
842 |
66%|βββββββ | 284/430 [6:26:14<3:16:05, 80.59s/it]
|
843 |
66%|βββββββ | 285/430 [6:27:35<3:14:40, 80.56s/it]
|
844 |
|
845 |
+
|
846 |
66%|βββββββ | 285/430 [6:27:35<3:14:40, 80.56s/it]
|
847 |
67%|βββββββ | 286/430 [6:28:55<3:13:19, 80.55s/it]
|
848 |
67%|βββββββ | 287/430 [6:30:16<3:11:55, 80.53s/it]
|
849 |
67%|βββββββ | 288/430 [6:31:37<3:10:52, 80.65s/it]
|
850 |
67%|βββββββ | 289/430 [6:32:57<3:09:26, 80.62s/it]
|
851 |
67%|βββββββ | 290/430 [6:34:18<3:07:59, 80.57s/it]
|
852 |
|
853 |
+
|
854 |
67%|βββββββ | 290/430 [6:34:18<3:07:59, 80.57s/it]
|
855 |
68%|βββββββ | 291/430 [6:35:39<3:06:56, 80.69s/it]
|
856 |
68%|βββββββ | 292/430 [6:36:59<3:05:26, 80.63s/it]
|
857 |
68%|βββββββ | 293/430 [6:38:20<3:04:03, 80.61s/it]
|
858 |
68%|βββββββ | 294/430 [6:39:40<3:02:37, 80.57s/it]
|
859 |
69%|βββββββ | 295/430 [6:41:01<3:01:16, 80.56s/it]
|
860 |
|
861 |
+
|
862 |
69%|βββββββ | 295/430 [6:41:01<3:01:16, 80.56s/it]
|
863 |
69%|βββββββ | 296/430 [6:42:21<2:59:53, 80.55s/it]
|
864 |
69%|βββββββ | 297/430 [6:43:42<2:58:55, 80.72s/it]
|
865 |
69%|βββββββ | 298/430 [6:45:03<2:57:26, 80.66s/it]
|
866 |
70%|βββββββ | 299/430 [6:46:23<2:55:57, 80.59s/it]
|
867 |
70%|βββββββ | 300/430 [6:47:44<2:54:31, 80.55s/it]
|
868 |
|
869 |
+
|
870 |
70%|βββββββ | 300/430 [6:47:44<2:54:31, 80.55s/it]Saving model checkpoint to ./results/checkpoint-300
|
871 |
+
Configuration saved in ./results/checkpoint-300/config.json
|
872 |
+
Model weights saved in ./results/checkpoint-300/pytorch_model.bin
|
873 |
+
tokenizer config file saved in ./results/checkpoint-300/tokenizer_config.json
|
874 |
+
Special tokens file saved in ./results/checkpoint-300/special_tokens_map.json
|
875 |
+
[2023-02-20 23:54:13,030] [INFO] [logging.py:75:log_dist] [Rank 0] [Torch] Checkpoint global_step303 is begin to save!
|
876 |
+
[2023-02-20 23:54:13,032] [INFO] [logging.py:75:log_dist] [Rank 0] Saving model checkpoint: ./results/checkpoint-300/global_step303/mp_rank_00_model_states.pt
|
877 |
+
[2023-02-20 23:54:13,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving ./results/checkpoint-300/global_step303/mp_rank_00_model_states.pt...
|
878 |
+
[2023-02-20 23:54:13,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved ./results/checkpoint-300/global_step303/mp_rank_00_model_states.pt.
|
879 |
+
[2023-02-20 23:54:13,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving ./results/checkpoint-300/global_step303/zero_pp_rank_0_mp_rank_00_optim_states.pt...
|
880 |
+
[2023-02-20 23:54:13,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved ./results/checkpoint-300/global_step303/zero_pp_rank_0_mp_rank_00_optim_states.pt.
|
881 |
+
[2023-02-20 23:54:13,740] [INFO] [engine.py:3407:_save_zero_checkpoint] zero checkpoint saved ./results/checkpoint-300/global_step303/zero_pp_rank_0_mp_rank_00_optim_states.pt
|
882 |
+
[2023-02-20 23:54:13,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step303 is ready now!
|
883 |
+
Deleting older checkpoint [results/checkpoint-150] due to args.save_total_limit
|
884 |
+
|
885 |
70%|βββββββ | 301/430 [6:49:06<2:54:08, 80.99s/it]
|
886 |
70%|βββββββ | 302/430 [6:51:15<3:24:02, 95.64s/it]
|
887 |
70%|βββββββ | 303/430 [6:52:36<3:12:48, 91.09s/it]
|
888 |
71%|βββββββ | 304/430 [6:53:56<3:04:36, 87.91s/it]
|
889 |
71%|βββββββ | 305/430 [6:55:17<2:58:30, 85.68s/it]
|
890 |
|
891 |
+
|
892 |
71%|βββββββ | 305/430 [6:55:17<2:58:30, 85.68s/it]
|
893 |
71%|βββββββ | 306/430 [6:56:38<2:54:00, 84.19s/it]
|
894 |
71%|ββββββββ | 307/430 [6:57:59<2:50:44, 83.29s/it]
|
895 |
72%|ββββββββ | 308/430 [6:59:19<2:47:40, 82.46s/it]
|
896 |
72%|ββββββββ | 309/430 [7:00:40<2:45:10, 81.91s/it]
|
897 |
72%|ββββββββ | 310/430 [7:02:00<2:42:57, 81.48s/it]
|
898 |
|
899 |
+
|
900 |
72%|ββββββββ | 310/430 [7:02:00<2:42:57, 81.48s/it]
|
901 |
72%|ββββββββ | 311/430 [7:03:21<2:41:01, 81.18s/it]
|
902 |
73%|ββββββββ | 312/430 [7:04:41<2:39:14, 80.97s/it]
|
903 |
73%|ββββββββ | 313/430 [7:06:02<2:37:35, 80.82s/it]
|
904 |
73%|ββββββββ | 314/430 [7:07:22<2:36:02, 80.71s/it]
|
905 |
73%|ββββββββ | 315/430 [7:08:43<2:34:33, 80.64s/it]
|
906 |
|
907 |
+
|
908 |
73%|ββββββββ | 315/430 [7:08:43<2:34:33, 80.64s/it]
|
909 |
73%|ββββββββ | 316/430 [7:10:03<2:33:10, 80.62s/it]
|
910 |
74%|ββββββββ | 317/430 [7:11:24<2:31:46, 80.59s/it]
|
911 |
74%|ββββββββ | 318/430 [7:12:44<2:30:23, 80.56s/it]
|
912 |
74%|ββββββββ | 319/430 [7:14:05<2:29:01, 80.56s/it]
|
913 |
74%|ββββββββ | 320/430 [7:15:26<2:27:42, 80.57s/it]
|
914 |
|
915 |
+
|
916 |
74%|ββββββββ | 320/430 [7:15:26<2:27:42, 80.57s/it]
|
917 |
75%|ββββββββ | 321/430 [7:16:46<2:26:19, 80.54s/it]
|
918 |
75%|ββββββββ | 322/430 [7:18:07<2:24:58, 80.54s/it]
|
919 |
75%|ββββββββ | 323/430 [7:19:27<2:23:39, 80.55s/it]
|
920 |
75%|ββββββββ | 324/430 [7:20:48<2:22:15, 80.53s/it]
|
921 |
76%|ββββββββ | 325/430 [7:22:08<2:20:55, 80.53s/it]
|
922 |
|
923 |
+
|
924 |
76%|ββββββββ | 325/430 [7:22:08<2:20:55, 80.53s/it]
|
925 |
76%|ββββββββ | 326/430 [7:23:29<2:19:34, 80.52s/it]
|
926 |
76%|ββββββββ | 327/430 [7:24:49<2:18:11, 80.50s/it]
|
927 |
76%|ββββββββ | 328/430 [7:26:10<2:16:52, 80.52s/it]
|
928 |
77%|ββββββββ | 329/430 [7:27:30<2:15:32, 80.52s/it]
|
929 |
77%|ββββββββ | 330/430 [7:28:51<2:14:13, 80.53s/it]
|
930 |
|
931 |
+
|
932 |
77%|ββββββββ | 330/430 [7:28:51<2:14:13, 80.53s/it]
|
933 |
77%|ββββββββ | 331/430 [7:30:11<2:12:57, 80.58s/it]
|
934 |
77%|ββββββββ | 332/430 [7:31:32<2:11:44, 80.66s/it]
|
935 |
77%|ββββββββ | 333/430 [7:32:53<2:10:32, 80.75s/it]
|
936 |
78%|ββββββββ | 334/430 [7:34:14<2:09:04, 80.67s/it]
|
937 |
78%|ββββββββ | 335/430 [7:35:34<2:07:39, 80.62s/it]
|
938 |
|
939 |
+
|
940 |
78%|ββββββββ | 335/430 [7:35:34<2:07:39, 80.62s/it]
|
941 |
78%|ββββββββ | 336/430 [7:36:55<2:06:30, 80.75s/it]
|
942 |
78%|ββββββββ | 337/430 [7:38:16<2:05:08, 80.73s/it]
|
943 |
79%|ββββββββ | 338/430 [7:39:36<2:03:41, 80.66s/it]
|
944 |
79%|ββββββββ | 339/430 [7:40:57<2:02:15, 80.61s/it]
|
945 |
79%|ββββββββ | 340/430 [7:42:17<2:00:50, 80.56s/it]
|
946 |
|
947 |
+
|
948 |
79%|ββββββββ | 340/430 [7:42:17<2:00:50, 80.56s/it]
|
949 |
79%|ββββββββ | 341/430 [7:43:38<1:59:27, 80.53s/it]
|
950 |
80%|ββββββββ | 342/430 [7:44:58<1:58:06, 80.53s/it]
|
951 |
80%|ββββββββ | 343/430 [7:46:19<1:56:47, 80.55s/it]
|
952 |
80%|ββββββββ | 344/430 [7:47:40<1:55:30, 80.59s/it]
|
953 |
80%|ββββββββ | 345/430 [7:49:49<2:14:57, 95.26s/it]
|
954 |
|
955 |
+
|
956 |
80%|ββββββββ | 345/430 [7:49:49<2:14:57, 95.26s/it]
|
957 |
80%|ββββββββ | 346/430 [7:51:10<2:07:10, 90.84s/it]
|
958 |
81%|ββββββββ | 347/430 [7:52:30<2:01:21, 87.73s/it]
|
959 |
81%|ββββββββ | 348/430 [7:53:51<1:56:55, 85.56s/it]
|
960 |
81%|ββββββββ | 349/430 [7:55:11<1:53:26, 84.03s/it]
|
961 |
81%|βββββββββ | 350/430 [7:56:32<1:50:39, 83.00s/it]
|
962 |
|
963 |
+
|
964 |
81%|βββββββββ | 350/430 [7:56:32<1:50:39, 83.00s/it]
|
965 |
82%|βββββββββ | 351/430 [7:57:52<1:48:17, 82.24s/it]
|
966 |
82%|βββββββββ | 352/430 [7:59:13<1:46:14, 81.72s/it]
|
967 |
82%|βββββββββ | 353/430 [8:00:34<1:44:40, 81.56s/it]
|
968 |
82%|βββββββββ | 354/430 [8:01:54<1:42:53, 81.22s/it]
|
969 |
83%|βββββββββ | 355/430 [8:03:15<1:41:14, 80.99s/it]
|
970 |
|
971 |
+
|
972 |
83%|βββββββββ | 355/430 [8:03:15<1:41:14, 80.99s/it]
|
973 |
83%|βββββββββ | 356/430 [8:04:36<1:39:57, 81.05s/it]
|
974 |
83%|βββββββββ | 357/430 [8:05:56<1:38:25, 80.90s/it]
|
975 |
83%|βββββββββ | 358/430 [8:07:17<1:36:55, 80.77s/it]
|
976 |
83%|βββββββββ | 359/430 [8:08:38<1:35:30, 80.71s/it]
|
977 |
84%|βββββββββ | 360/430 [8:09:58<1:34:03, 80.63s/it]
|
978 |
|
979 |
+
|
980 |
84%|βββββββββ | 360/430 [8:09:58<1:34:03, 80.63s/it]
|
981 |
84%|βββββββββ | 361/430 [8:11:18<1:32:39, 80.57s/it]
|
982 |
84%|βββββββββ | 362/430 [8:12:39<1:31:17, 80.55s/it]
|
983 |
84%|βββββββββ | 363/430 [8:13:59<1:29:56, 80.54s/it]
|
984 |
85%|βββββββββ | 364/430 [8:15:20<1:28:34, 80.52s/it]
|
985 |
85%|βββββββββ | 365/430 [8:16:40<1:27:12, 80.50s/it]
|
986 |
|
987 |
+
|
988 |
85%|βββββββββ | 365/430 [8:16:40<1:27:12, 80.50s/it]
|
989 |
85%|βββββββββ | 366/430 [8:18:01<1:25:49, 80.47s/it]
|
990 |
85%|βββββββββ | 367/430 [8:19:21<1:24:27, 80.44s/it]
|
991 |
86%|βββββββββ | 368/430 [8:20:42<1:23:10, 80.49s/it]
|
992 |
86%|βββββββββ | 369/430 [8:22:02<1:21:48, 80.47s/it]
|
993 |
86%|βββββββββ | 370/430 [8:23:23<1:20:29, 80.49s/it]
|
994 |
|
995 |
+
|
996 |
86%|βββββββββ | 370/430 [8:23:23<1:20:29, 80.49s/it]
|
997 |
86%|βββββββββ | 371/430 [8:24:43<1:19:09, 80.51s/it]
|
998 |
87%|βββββββββ | 372/430 [8:26:04<1:17:55, 80.61s/it]
|
999 |
87%|βββββββββ | 373/430 [8:27:25<1:16:32, 80.57s/it]
|
1000 |
87%|βββββββββ | 374/430 [8:28:45<1:15:10, 80.55s/it]
|
1001 |
87%|βββββββββ | 375/430 [8:30:06<1:13:49, 80.54s/it]
|
1002 |
|
1003 |
+
|
1004 |
87%|βββββββββ | 375/430 [8:30:06<1:13:49, 80.54s/it]
|
1005 |
87%|βββββββββ | 376/430 [8:31:26<1:12:29, 80.54s/it]
|
1006 |
88%|βββββββββ | 377/430 [8:32:47<1:11:07, 80.53s/it]
|
1007 |
88%|βββββββββ | 378/430 [8:34:07<1:09:47, 80.53s/it]
|
1008 |
88%|βββββββββ | 379/430 [8:35:28<1:08:26, 80.52s/it]
|
1009 |
88%|βββββββββ | 380/430 [8:36:48<1:07:07, 80.55s/it]
|
1010 |
|
1011 |
+
|
1012 |
88%|βββββββββ | 380/430 [8:36:48<1:07:07, 80.55s/it]
|
1013 |
89%|βββββββββ | 381/430 [8:38:09<1:05:47, 80.56s/it]
|
1014 |
89%|βββββββββ | 382/430 [8:39:29<1:04:25, 80.53s/it]
|
1015 |
89%|βββββββββ | 383/430 [8:40:50<1:03:04, 80.53s/it]
|
1016 |
89%|βββββββββ | 384/430 [8:42:10<1:01:43, 80.51s/it]
|
1017 |
90%|βββββββββ | 385/430 [8:43:31<1:00:22, 80.49s/it]
|
1018 |
|
1019 |
+
|
1020 |
90%|βββββββββ | 385/430 [8:43:31<1:00:22, 80.49s/it]
|
1021 |
90%|βββββββββ | 386/430 [8:44:52<59:06, 80.61s/it]
|
1022 |
90%|βββββββββ | 387/430 [8:46:12<57:43, 80.55s/it]
|
1023 |
90%|βββββββββ | 388/430 [8:48:22<1:06:42, 95.31s/it]
|
1024 |
90%|βββββββββ | 389/430 [8:49:42<1:02:05, 90.85s/it]
|
1025 |
91%|βββββββββ | 390/430 [8:51:03<58:29, 87.73s/it]
|
1026 |
|
1027 |
+
|
1028 |
91%|βββββββββ | 390/430 [8:51:03<58:29, 87.73s/it]
|
1029 |
91%|βββββββββ | 391/430 [8:52:23<55:37, 85.58s/it]
|
1030 |
91%|βββββββββ | 392/430 [8:53:44<53:14, 84.05s/it]
|
1031 |
91%|ββββββββββ| 393/430 [8:55:04<51:10, 82.98s/it]
|
1032 |
92%|ββββββββββ| 394/430 [8:56:25<49:25, 82.39s/it]
|
1033 |
92%|ββββββββββ| 395/430 [8:57:46<47:43, 81.82s/it]
|
1034 |
|
1035 |
+
|
1036 |
92%|ββββββββββ| 395/430 [8:57:46<47:43, 81.82s/it]
|
1037 |
92%|ββββββββββ| 396/430 [8:59:06<46:09, 81.45s/it]
|
1038 |
92%|ββββββββββ| 397/430 [9:00:27<44:37, 81.15s/it]
|
1039 |
93%|ββββββββββ| 398/430 [9:01:47<43:09, 80.93s/it]
|
1040 |
93%|ββββββββββ| 399/430 [9:03:08<41:44, 80.80s/it]
|
1041 |
93%|ββββββββββ| 400/430 [9:04:28<40:20, 80.70s/it]
|
1042 |
|
1043 |
+
|
1044 |
93%|ββββββββββ| 400/430 [9:04:28<40:20, 80.70s/it]
|
1045 |
93%|ββββββββββ| 401/430 [9:05:49<38:58, 80.65s/it]
|
1046 |
93%|ββββββββββ| 402/430 [9:07:09<37:38, 80.65s/it]
|
1047 |
94%|ββββββββββ| 403/430 [9:08:30<36:16, 80.59s/it]
|
1048 |
94%|ββββββββββ| 404/430 [9:09:50<34:54, 80.56s/it]
|
1049 |
94%|ββββββββββ| 405/430 [9:11:11<33:33, 80.54s/it]
|
1050 |
|
1051 |
+
|
1052 |
94%|ββββββββββ| 405/430 [9:11:11<33:33, 80.54s/it]
|
1053 |
94%|ββββββββββ| 406/430 [9:12:32<32:14, 80.61s/it]
|
1054 |
95%|ββββββββββ| 407/430 [9:13:52<30:52, 80.56s/it]
|
1055 |
95%|ββββββββββ| 408/430 [9:15:12<29:31, 80.54s/it]
|
1056 |
95%|ββββββββββ| 409/430 [9:16:33<28:11, 80.54s/it]
|
1057 |
95%|ββββββββββ| 410/430 [9:17:53<26:50, 80.51s/it]
|
1058 |
|
1059 |
+
|
1060 |
95%|ββββββββββ| 410/430 [9:17:53<26:50, 80.51s/it]
|
1061 |
96%|ββββββββββ| 411/430 [9:19:15<25:33, 80.70s/it]
|
1062 |
96%|ββββββββββ| 412/430 [9:20:35<24:11, 80.65s/it]
|
1063 |
96%|ββββββββββ| 413/430 [9:21:56<22:50, 80.61s/it]
|
1064 |
96%|ββββββββββ| 414/430 [9:23:16<21:28, 80.56s/it]
|
1065 |
97%|ββββββββββ| 415/430 [9:24:37<20:09, 80.62s/it]
|
1066 |
|
1067 |
+
|
1068 |
97%|ββββββββββ| 415/430 [9:24:37<20:09, 80.62s/it]
|
1069 |
97%|ββββββββββ| 416/430 [9:25:57<18:48, 80.58s/it]
|
1070 |
97%|ββββββββββ| 417/430 [9:27:18<17:27, 80.54s/it]
|
1071 |
97%|ββββββββββ| 418/430 [9:28:38<16:06, 80.54s/it]
|
1072 |
97%|ββββββββββ| 419/430 [9:29:59<14:45, 80.51s/it]
|
1073 |
98%|ββββββββββ| 420/430 [9:31:20<13:26, 80.65s/it]
|
1074 |
|
1075 |
+
|
1076 |
98%|ββββββββββ| 420/430 [9:31:20<13:26, 80.65s/it]
|
1077 |
98%|ββββββββββ| 421/430 [9:32:40<12:05, 80.59s/it]
|
1078 |
98%|ββββββββββ| 422/430 [9:34:01<10:44, 80.56s/it]
|
1079 |
98%|ββββββββββ| 423/430 [9:35:21<09:23, 80.53s/it]
|
1080 |
99%|ββββββββββ| 424/430 [9:36:42<08:03, 80.54s/it]
|
1081 |
99%|ββββββββββ| 425/430 [9:38:02<06:42, 80.52s/it]
|
1082 |
|
1083 |
+
|
1084 |
99%|ββββββββββ| 425/430 [9:38:02<06:42, 80.52s/it]
|
1085 |
99%|ββββββββββ| 426/430 [9:39:23<05:22, 80.52s/it]
|
1086 |
99%|ββββββββββ| 427/430 [9:40:43<04:01, 80.52s/it]
|
1087 |
|
1088 |
+
|
1089 |
+
|
1090 |
+
Training completed. Do not forget to share your model on huggingface.co/models =)
|
1091 |
+
|
1092 |
+
|
1093 |
+
Time: 35085.85Time: 35085.80
|
1094 |
+
Time: 35085.78
|
1095 |
+
Samples/second: 33.41Time: 35085.88
|
1096 |
+
Samples/second: 33.41
|
1097 |
+
|
1098 |
+
Samples/second: 33.41
|
1099 |
+
Samples/second: 33.41
|
1100 |
+
|
1101 |
+
|
1102 |
|
1103 |
+
Time: 35085.79
|
1104 |
+
Time: 35085.76Samples/second: 33.41
|
1105 |
+
|
1106 |
+
Samples/second: 33.41
|
1107 |
+
|
1108 |
+
GPU memory occupied: 43825 MB.
|
1109 |
+
GPU memory occupied: 43825 MB.
|
1110 |
+
GPU memory occupied: 43825 MB.
|
1111 |
+
GPU memory occupied: 43825 MB.
|
1112 |
+
GPU memory occupied: 43825 MB.
|
1113 |
+
|
1114 |
+
Time: 35085.49
|
1115 |
+
Samples/second: 33.41
|
1116 |
+
GPU memory occupied: 43825 MB.
|
1117 |
+
Configuration saved in experiments/2023-02-21-b0010c97cb1f06debca911602ea05b6ff85a8270fb9487d27b3d52eb4eb29e9e/final_checkpoint/config.json
|
1118 |
+
Model weights saved in experiments/2023-02-21-b0010c97cb1f06debca911602ea05b6ff85a8270fb9487d27b3d52eb4eb29e9e/final_checkpoint/pytorch_model.bin
|
1119 |
+
tokenizer config file saved in experiments/2023-02-21-b0010c97cb1f06debca911602ea05b6ff85a8270fb9487d27b3d52eb4eb29e9e/final_checkpoint/tokenizer/tokenizer_config.json
|
1120 |
+
Special tokens file saved in experiments/2023-02-21-b0010c97cb1f06debca911602ea05b6ff85a8270fb9487d27b3d52eb4eb29e9e/final_checkpoint/tokenizer/special_tokens_map.json
|
1121 |
+
Saving model checkpoint to experiments/2023-02-21-b0010c97cb1f06debca911602ea05b6ff85a8270fb9487d27b3d52eb4eb29e9e/trainer_final_checkpoint
|
1122 |
+
Configuration saved in experiments/2023-02-21-b0010c97cb1f06debca911602ea05b6ff85a8270fb9487d27b3d52eb4eb29e9e/trainer_final_checkpoint/config.json
|
1123 |
+
Model weights saved in experiments/2023-02-21-b0010c97cb1f06debca911602ea05b6ff85a8270fb9487d27b3d52eb4eb29e9e/trainer_final_checkpoint/pytorch_model.bin
|
1124 |
+
tokenizer config file saved in experiments/2023-02-21-b0010c97cb1f06debca911602ea05b6ff85a8270fb9487d27b3d52eb4eb29e9e/trainer_final_checkpoint/tokenizer_config.json
|
1125 |
+
Special tokens file saved in experiments/2023-02-21-b0010c97cb1f06debca911602ea05b6ff85a8270fb9487d27b3d52eb4eb29e9e/trainer_final_checkpoint/special_tokens_map.json
|
1126 |
+
Traceback (most recent call last):
|
1127 |
+
File "tune_gpt.py", line 227, in <module>
|
1128 |
+
trainer.save_state(trainer_save_dir)
|
1129 |
+
TypeError: save_state() takes 1 positional argument but 2 were given
|
1130 |
+
[2023-02-21 02:51:15,357] [INFO] [launch.py:350:main] Process 31459 exits successfully.
|
1131 |
+
[2023-02-21 02:51:15,358] [INFO] [launch.py:350:main] Process 31463 exits successfully.
|
1132 |
+
[2023-02-21 02:51:16,360] [INFO] [launch.py:350:main] Process 31486 exits successfully.
|
1133 |
+
[2023-02-21 02:51:16,360] [INFO] [launch.py:350:main] Process 31471 exits successfully.
|
1134 |
+
[2023-02-21 02:51:16,360] [INFO] [launch.py:350:main] Process 31478 exits successfully.
|
1135 |
+
[2023-02-21 02:51:16,361] [INFO] [launch.py:350:main] Process 31490 exits successfully.
|
1136 |
+
[2023-02-21 02:51:17,362] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 31458
|
1137 |
+
[2023-02-21 02:51:17,363] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 31459
|
1138 |
+
[2023-02-21 02:51:17,363] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 31463
|
1139 |
+
[2023-02-21 02:51:17,363] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 31471
|
1140 |
+
[2023-02-21 02:51:17,363] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 31478
|
1141 |
+
[2023-02-21 02:51:17,363] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 31486
|
1142 |
+
[2023-02-21 02:51:17,363] [INFO] [launch.py:318:sigkill_handler] Killing subprocess 31490
|
1143 |
+
[2023-02-21 02:51:17,364] [ERROR] [launch.py:324:sigkill_handler] ['/opt/conda/bin/python3', '-u', 'tune_gpt.py', '--local_rank=6', '--deepspeed', 'deepspeed.json', '--upload-experiment'] exits with return code = 1
|
1144 |
+
/opt/conda/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
|
1145 |
+
from pandas import MultiIndex, Int64Index
|