jtatman committed on
Commit
3a80cbf
1 Parent(s): f95286d

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +0 -7
  2. tokenizer.json +39 -3
  3. tokenizer_config.json +36 -4
special_tokens_map.json CHANGED
@@ -13,13 +13,6 @@
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
- "pad_token": {
17
- "content": "[PAD]",
18
- "lstrip": false,
19
- "normalized": false,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
  "unk_token": {
24
  "content": "<|endoftext|>",
25
  "lstrip": false,
 
13
  "rstrip": false,
14
  "single_word": false
15
  },
 
 
 
 
 
 
 
16
  "unk_token": {
17
  "content": "<|endoftext|>",
18
  "lstrip": false,
tokenizer.json CHANGED
@@ -230,12 +230,48 @@
230
  },
231
  {
232
  "id": 50277,
233
- "content": "[PAD]",
234
  "single_word": false,
235
  "lstrip": false,
236
  "rstrip": false,
237
- "normalized": false,
238
- "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  }
240
  ],
241
  "normalizer": {
 
230
  },
231
  {
232
  "id": 50277,
233
+ "content": "<|im_start|>",
234
  "single_word": false,
235
  "lstrip": false,
236
  "rstrip": false,
237
+ "normalized": true,
238
+ "special": false
239
+ },
240
+ {
241
+ "id": 50278,
242
+ "content": "<|im_end|>",
243
+ "single_word": false,
244
+ "lstrip": false,
245
+ "rstrip": false,
246
+ "normalized": true,
247
+ "special": false
248
+ },
249
+ {
250
+ "id": 50279,
251
+ "content": "<s>",
252
+ "single_word": false,
253
+ "lstrip": false,
254
+ "rstrip": false,
255
+ "normalized": true,
256
+ "special": false
257
+ },
258
+ {
259
+ "id": 50280,
260
+ "content": "</s>",
261
+ "single_word": false,
262
+ "lstrip": false,
263
+ "rstrip": false,
264
+ "normalized": true,
265
+ "special": false
266
+ },
267
+ {
268
+ "id": 50281,
269
+ "content": "<|end_of_text|>",
270
+ "single_word": false,
271
+ "lstrip": false,
272
+ "rstrip": false,
273
+ "normalized": true,
274
+ "special": false
275
  }
276
  ],
277
  "normalizer": {
tokenizer_config.json CHANGED
@@ -204,19 +204,51 @@
204
  "special": false
205
  },
206
  "50277": {
207
- "content": "[PAD]",
208
  "lstrip": false,
209
- "normalized": false,
210
  "rstrip": false,
211
  "single_word": false,
212
- "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  }
214
  },
215
  "bos_token": "<|endoftext|>",
216
  "clean_up_tokenization_spaces": true,
217
  "eos_token": "<|endoftext|>",
218
  "model_max_length": 1000000000000000019884624838656,
219
- "pad_token": "[PAD]",
220
  "tokenizer_class": "GPTNeoXTokenizer",
221
  "unk_token": "<|endoftext|>"
222
  }
 
204
  "special": false
205
  },
206
  "50277": {
207
+ "content": "<|im_start|>",
208
  "lstrip": false,
209
+ "normalized": true,
210
  "rstrip": false,
211
  "single_word": false,
212
+ "special": false
213
+ },
214
+ "50278": {
215
+ "content": "<|im_end|>",
216
+ "lstrip": false,
217
+ "normalized": true,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": false
221
+ },
222
+ "50279": {
223
+ "content": "<s>",
224
+ "lstrip": false,
225
+ "normalized": true,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": false
229
+ },
230
+ "50280": {
231
+ "content": "</s>",
232
+ "lstrip": false,
233
+ "normalized": true,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": false
237
+ },
238
+ "50281": {
239
+ "content": "<|end_of_text|>",
240
+ "lstrip": false,
241
+ "normalized": true,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": false
245
  }
246
  },
247
  "bos_token": "<|endoftext|>",
248
  "clean_up_tokenization_spaces": true,
249
  "eos_token": "<|endoftext|>",
250
  "model_max_length": 1000000000000000019884624838656,
251
+ "pad_token": null,
252
  "tokenizer_class": "GPTNeoXTokenizer",
253
  "unk_token": "<|endoftext|>"
254
  }