foxanthis committed on
Commit
695c6b6
1 Parent(s): e560188

Upload tokenizer

Browse files
added_tokens.json CHANGED
@@ -1,40 +1,6 @@
1
  {
2
- "\t\t": 50294,
3
- "\t\t\t": 50293,
4
- "\t\t\t\t": 50292,
5
- "\t\t\t\t\t": 50291,
6
- "\t\t\t\t\t\t": 50290,
7
- "\t\t\t\t\t\t\t": 50289,
8
- "\t\t\t\t\t\t\t\t": 50288,
9
- "\t\t\t\t\t\t\t\t\t": 50287,
10
- " ": 50286,
11
- " ": 50285,
12
- " ": 50284,
13
- " ": 50283,
14
- " ": 50282,
15
- " ": 50281,
16
- " ": 50280,
17
- " ": 50279,
18
- " ": 50278,
19
- " ": 50277,
20
- " ": 50276,
21
- " ": 50275,
22
- " ": 50274,
23
- " ": 50273,
24
- " ": 50272,
25
- " ": 50271,
26
- " ": 50270,
27
- " ": 50269,
28
- " ": 50268,
29
- " ": 50267,
30
- " ": 50266,
31
- " ": 50265,
32
- " ": 50264,
33
- " ": 50263,
34
- " ": 50262,
35
- " ": 50261,
36
- " ": 50260,
37
- " ": 50259,
38
- " ": 50258,
39
- " ": 50257
40
  }
 
1
  {
2
+ "\t\t": 50257,
3
+ " ": 50258,
4
+ " ": 50259,
5
+ " ": 50260
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  }
special_tokens_map.json CHANGED
@@ -1,5 +1,29 @@
1
  {
2
- "bos_token": "<|endoftext|>",
3
- "eos_token": "<|endoftext|>",
4
- "unk_token": "<|endoftext|>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  }
 
1
  {
2
+ "additional_special_tokens": [
3
+ "\t\t",
4
+ " ",
5
+ " ",
6
+ " "
7
+ ],
8
+ "bos_token": {
9
+ "content": "<|endoftext|>",
10
+ "lstrip": false,
11
+ "normalized": true,
12
+ "rstrip": false,
13
+ "single_word": false
14
+ },
15
+ "eos_token": {
16
+ "content": "<|endoftext|>",
17
+ "lstrip": false,
18
+ "normalized": true,
19
+ "rstrip": false,
20
+ "single_word": false
21
+ },
22
+ "unk_token": {
23
+ "content": "<|endoftext|>",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false
28
+ }
29
  }
tokenizer.json CHANGED
@@ -9,350 +9,44 @@
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
12
- "normalized": false,
13
  "special": true
14
  },
15
  {
16
  "id": 50257,
17
- "content": " ",
18
  "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
21
- "normalized": true,
22
- "special": false
23
  },
24
  {
25
  "id": 50258,
26
- "content": " ",
27
  "single_word": false,
28
  "lstrip": false,
29
  "rstrip": false,
30
- "normalized": true,
31
- "special": false
32
  },
33
  {
34
  "id": 50259,
35
- "content": " ",
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
39
- "normalized": true,
40
- "special": false
41
  },
42
  {
43
  "id": 50260,
44
- "content": " ",
45
- "single_word": false,
46
- "lstrip": false,
47
- "rstrip": false,
48
- "normalized": true,
49
- "special": false
50
- },
51
- {
52
- "id": 50261,
53
- "content": " ",
54
- "single_word": false,
55
- "lstrip": false,
56
- "rstrip": false,
57
- "normalized": true,
58
- "special": false
59
- },
60
- {
61
- "id": 50262,
62
- "content": " ",
63
- "single_word": false,
64
- "lstrip": false,
65
- "rstrip": false,
66
- "normalized": true,
67
- "special": false
68
- },
69
- {
70
- "id": 50263,
71
- "content": " ",
72
- "single_word": false,
73
- "lstrip": false,
74
- "rstrip": false,
75
- "normalized": true,
76
- "special": false
77
- },
78
- {
79
- "id": 50264,
80
- "content": " ",
81
- "single_word": false,
82
- "lstrip": false,
83
- "rstrip": false,
84
- "normalized": true,
85
- "special": false
86
- },
87
- {
88
- "id": 50265,
89
- "content": " ",
90
- "single_word": false,
91
- "lstrip": false,
92
- "rstrip": false,
93
- "normalized": true,
94
- "special": false
95
- },
96
- {
97
- "id": 50266,
98
- "content": " ",
99
- "single_word": false,
100
- "lstrip": false,
101
- "rstrip": false,
102
- "normalized": true,
103
- "special": false
104
- },
105
- {
106
- "id": 50267,
107
- "content": " ",
108
- "single_word": false,
109
- "lstrip": false,
110
- "rstrip": false,
111
- "normalized": true,
112
- "special": false
113
- },
114
- {
115
- "id": 50268,
116
- "content": " ",
117
- "single_word": false,
118
- "lstrip": false,
119
- "rstrip": false,
120
- "normalized": true,
121
- "special": false
122
- },
123
- {
124
- "id": 50269,
125
- "content": " ",
126
- "single_word": false,
127
- "lstrip": false,
128
- "rstrip": false,
129
- "normalized": true,
130
- "special": false
131
- },
132
- {
133
- "id": 50270,
134
- "content": " ",
135
- "single_word": false,
136
- "lstrip": false,
137
- "rstrip": false,
138
- "normalized": true,
139
- "special": false
140
- },
141
- {
142
- "id": 50271,
143
- "content": " ",
144
- "single_word": false,
145
- "lstrip": false,
146
- "rstrip": false,
147
- "normalized": true,
148
- "special": false
149
- },
150
- {
151
- "id": 50272,
152
- "content": " ",
153
- "single_word": false,
154
- "lstrip": false,
155
- "rstrip": false,
156
- "normalized": true,
157
- "special": false
158
- },
159
- {
160
- "id": 50273,
161
- "content": " ",
162
- "single_word": false,
163
- "lstrip": false,
164
- "rstrip": false,
165
- "normalized": true,
166
- "special": false
167
- },
168
- {
169
- "id": 50274,
170
- "content": " ",
171
- "single_word": false,
172
- "lstrip": false,
173
- "rstrip": false,
174
- "normalized": true,
175
- "special": false
176
- },
177
- {
178
- "id": 50275,
179
- "content": " ",
180
- "single_word": false,
181
- "lstrip": false,
182
- "rstrip": false,
183
- "normalized": true,
184
- "special": false
185
- },
186
- {
187
- "id": 50276,
188
- "content": " ",
189
- "single_word": false,
190
- "lstrip": false,
191
- "rstrip": false,
192
- "normalized": true,
193
- "special": false
194
- },
195
- {
196
- "id": 50277,
197
- "content": " ",
198
- "single_word": false,
199
- "lstrip": false,
200
- "rstrip": false,
201
- "normalized": true,
202
- "special": false
203
- },
204
- {
205
- "id": 50278,
206
- "content": " ",
207
- "single_word": false,
208
- "lstrip": false,
209
- "rstrip": false,
210
- "normalized": true,
211
- "special": false
212
- },
213
- {
214
- "id": 50279,
215
- "content": " ",
216
- "single_word": false,
217
- "lstrip": false,
218
- "rstrip": false,
219
- "normalized": true,
220
- "special": false
221
- },
222
- {
223
- "id": 50280,
224
  "content": " ",
225
  "single_word": false,
226
  "lstrip": false,
227
  "rstrip": false,
228
- "normalized": true,
229
- "special": false
230
- },
231
- {
232
- "id": 50281,
233
- "content": " ",
234
- "single_word": false,
235
- "lstrip": false,
236
- "rstrip": false,
237
- "normalized": true,
238
- "special": false
239
- },
240
- {
241
- "id": 50282,
242
- "content": " ",
243
- "single_word": false,
244
- "lstrip": false,
245
- "rstrip": false,
246
- "normalized": true,
247
- "special": false
248
- },
249
- {
250
- "id": 50283,
251
- "content": " ",
252
- "single_word": false,
253
- "lstrip": false,
254
- "rstrip": false,
255
- "normalized": true,
256
- "special": false
257
- },
258
- {
259
- "id": 50284,
260
- "content": " ",
261
- "single_word": false,
262
- "lstrip": false,
263
- "rstrip": false,
264
- "normalized": true,
265
- "special": false
266
- },
267
- {
268
- "id": 50285,
269
- "content": " ",
270
- "single_word": false,
271
- "lstrip": false,
272
- "rstrip": false,
273
- "normalized": true,
274
- "special": false
275
- },
276
- {
277
- "id": 50286,
278
- "content": " ",
279
- "single_word": false,
280
- "lstrip": false,
281
- "rstrip": false,
282
- "normalized": true,
283
- "special": false
284
- },
285
- {
286
- "id": 50287,
287
- "content": "\t\t\t\t\t\t\t\t\t",
288
- "single_word": false,
289
- "lstrip": false,
290
- "rstrip": false,
291
- "normalized": true,
292
- "special": false
293
- },
294
- {
295
- "id": 50288,
296
- "content": "\t\t\t\t\t\t\t\t",
297
- "single_word": false,
298
- "lstrip": false,
299
- "rstrip": false,
300
- "normalized": true,
301
- "special": false
302
- },
303
- {
304
- "id": 50289,
305
- "content": "\t\t\t\t\t\t\t",
306
- "single_word": false,
307
- "lstrip": false,
308
- "rstrip": false,
309
- "normalized": true,
310
- "special": false
311
- },
312
- {
313
- "id": 50290,
314
- "content": "\t\t\t\t\t\t",
315
- "single_word": false,
316
- "lstrip": false,
317
- "rstrip": false,
318
- "normalized": true,
319
- "special": false
320
- },
321
- {
322
- "id": 50291,
323
- "content": "\t\t\t\t\t",
324
- "single_word": false,
325
- "lstrip": false,
326
- "rstrip": false,
327
- "normalized": true,
328
- "special": false
329
- },
330
- {
331
- "id": 50292,
332
- "content": "\t\t\t\t",
333
- "single_word": false,
334
- "lstrip": false,
335
- "rstrip": false,
336
- "normalized": true,
337
- "special": false
338
- },
339
- {
340
- "id": 50293,
341
- "content": "\t\t\t",
342
- "single_word": false,
343
- "lstrip": false,
344
- "rstrip": false,
345
- "normalized": true,
346
- "special": false
347
- },
348
- {
349
- "id": 50294,
350
- "content": "\t\t",
351
- "single_word": false,
352
- "lstrip": false,
353
- "rstrip": false,
354
- "normalized": true,
355
- "special": false
356
  }
357
  ],
358
  "normalizer": null,
 
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
12
+ "normalized": true,
13
  "special": true
14
  },
15
  {
16
  "id": 50257,
17
+ "content": "\t\t",
18
  "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
  },
24
  {
25
  "id": 50258,
26
+ "content": " ",
27
  "single_word": false,
28
  "lstrip": false,
29
  "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
  },
33
  {
34
  "id": 50259,
35
+ "content": " ",
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
  },
42
  {
43
  "id": 50260,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  "content": " ",
45
  "single_word": false,
46
  "lstrip": false,
47
  "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  }
51
  ],
52
  "normalizer": null,
tokenizer_config.json CHANGED
@@ -1,10 +1,38 @@
1
  {
2
  "add_prefix_space": false,
3
- "bos_token": "<|endoftext|>",
4
- "eos_token": "<|endoftext|>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "model_max_length": 2048,
6
- "name_or_path": "Salesforce/codegen-350M-multi",
7
  "special_tokens_map_file": null,
8
- "tokenizer_class": "CodeGenTokenizer",
9
- "unk_token": "<|endoftext|>"
 
 
 
 
 
 
 
10
  }
 
1
  {
2
  "add_prefix_space": false,
3
+ "additional_special_tokens": [
4
+ "\t\t",
5
+ " ",
6
+ " ",
7
+ " "
8
+ ],
9
+ "bos_token": {
10
+ "__type": "AddedToken",
11
+ "content": "<|endoftext|>",
12
+ "lstrip": false,
13
+ "normalized": true,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ "eos_token": {
18
+ "__type": "AddedToken",
19
+ "content": "<|endoftext|>",
20
+ "lstrip": false,
21
+ "normalized": true,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "errors": "replace",
26
  "model_max_length": 2048,
27
+ "name_or_path": "flax-community/gpt-neo-125M-code-clippy",
28
  "special_tokens_map_file": null,
29
+ "tokenizer_class": "GPT2Tokenizer",
30
+ "unk_token": {
31
+ "__type": "AddedToken",
32
+ "content": "<|endoftext|>",
33
+ "lstrip": false,
34
+ "normalized": true,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ }
38
  }