rajammanabrolu committed on
Commit
fe7c483
1 Parent(s): 05b374f

Upload tokenizer

Files changed (3)
  1. special_tokens_map.json +9 -0
  2. tiktoken.py +290 -0
  3. tokenizer_config.json +24 -0
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": "<|endoftext|>",
+   "eos_token": "<|endoftext|>",
+   "unk_token": "<|endoftext|>"
+ }
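
The `<|im_start|>` and `<|im_end|>` additions are the ChatML turn delimiters, while the bos, eos, and unk tokens all collapse to tiktoken's `<|endoftext|>` token. A minimal sketch of the prompt shape these tokens imply (the conversation content is purely illustrative):

    # Illustrative ChatML-style prompt built from the special tokens above.
    prompt = ('<|im_start|>system\n'
              'You are a helpful assistant.<|im_end|>\n'
              '<|im_start|>user\n'
              'Say hello.<|im_end|>\n'
              '<|im_start|>assistant\n')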
tiktoken.py ADDED
@@ -0,0 +1,290 @@
+ # Copyright 2022 MosaicML LLM Foundry authors
+ # SPDX-License-Identifier: Apache-2.0
+
+ from typing import Any, Dict, List, Optional, Tuple, Union
+
+ import torch
+ from transformers import PreTrainedTokenizer
+
+
+ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
+     """A thin wrapper around tiktoken to make it compatible with Hugging Face tokenizers.
+
+     See Hugging Face for further documentation on general tokenizer methods.
+     """
+
+     model_input_names = ['input_ids', 'attention_mask']
+
+     def __init__(self,
+                  model_name: Optional[str] = None,
+                  encoding_name: Optional[str] = None,
+                  add_bos_token: bool = False,
+                  add_eos_token: bool = False,
+                  unk_token: Optional[str] = '<|endoftext|>',
+                  eos_token: Optional[str] = '<|endoftext|>',
+                  bos_token: Optional[str] = '<|endoftext|>',
+                  pad_token: Optional[str] = None,
+                  **kwargs: Dict[str, Any]):
+         """Constructor creates a tiktoken tokenizer to use as the underlying tokenizer.
+
+         Args:
+             model_name (Optional[str], optional): The name of the model to load from tiktoken. Defaults to None.
+                 Either model_name or encoding_name must be set, but not both.
+             encoding_name (Optional[str], optional): The name of the encoding to load from tiktoken. Defaults to None.
+                 Either model_name or encoding_name must be set, but not both.
+             add_bos_token (bool, optional): Whether to add bos tokens. Defaults to False.
+             add_eos_token (bool, optional): Whether to add eos tokens. Defaults to False.
+             unk_token (Optional[str], optional): The unk token. Defaults to '<|endoftext|>'.
+             eos_token (Optional[str], optional): The eos token. Defaults to '<|endoftext|>'.
+             bos_token (Optional[str], optional): The bos token. Defaults to '<|endoftext|>'.
+             pad_token (Optional[str], optional): The pad token. Defaults to None.
+         """
+         try:
+             import tiktoken
+         except ImportError:
+             raise ImportError(
+                 'You need to install tiktoken to use TiktokenTokenizerWrapper.')
+
+         if model_name is not None and encoding_name is not None:
+             raise ValueError(
+                 'You need to specify either model_name or encoding_name, not both.'
+             )
+
+         self.model_name = model_name
+         self.encoding_name = encoding_name
+
+         if self.model_name is not None:
+             self.encoding = tiktoken.encoding_for_model(  # type: ignore (thirdParty)
+                 self.model_name)
+         elif self.encoding_name is not None:
+             self.encoding = tiktoken.get_encoding(  # type: ignore (thirdParty)
+                 self.encoding_name)
+         else:
+             raise ValueError(
+                 'You need to specify either model_name or encoding_name.')
+
+         self.add_bos_token = add_bos_token
+         self.add_eos_token = add_eos_token
+
+         super().__init__(model_name=model_name,
+                          encoding_name=encoding_name,
+                          add_bos_token=add_bos_token,
+                          add_eos_token=add_eos_token,
+                          unk_token=unk_token,
+                          eos_token=eos_token,
+                          bos_token=bos_token,
+                          pad_token=pad_token,
+                          **kwargs)
+
+     @property
+     def vocab_size(self) -> int:
+         """Returns vocab size."""
+         return self.encoding.n_vocab
+
+     @property
+     def is_fast(self) -> bool:
+         return False
+
+     def get_vocab(self) -> Dict[str, int]:
+         """Returns vocab as a dict."""
+         vocab = {}
+         for i in range(self.vocab_size):
+             try:
+                 # need to try this first, so that we get a proper KeyError,
+                 # otherwise it crashes in the rust code
+                 _ = self.encoding.decode_single_token_bytes(i)
+                 vocab[self.encoding.decode([i])] = i
+             except KeyError:
+                 pass
+
+         return vocab
+
+     def _tokenize(self, text: str) -> List[int]:
+         """Returns a tokenized string.
+
+         Note: We have slightly redefined the expected contract between this method and
+         the _convert_token_to_id method. Normally, this method turns a string into a list of strings,
+         and then the _convert_token_to_id method turns that list of strings into a list of integers.
+         However, not all vocab indices can be decoded into a string, so instead we just return the integers
+         from this function, and have adjusted the _convert_token_to_id method to handle integers as well as strings.
+         The only use of _tokenize that I could find was in this way, so this _should_ be safe.
+         """
+         if not isinstance(text, str):
+             raise ValueError(
+                 f'Expected a string input to _tokenize but got {type(text)}.')
+
+         tokens = [t for t in self.encoding.encode(text, allowed_special='all')]
+
+         return tokens
+
+     def _convert_token_to_id(self, token: Union[int, str]) -> int:
+         """Converts a token (str) into an id using the vocab."""
+         if isinstance(token, int):
+             return token
+
+         return self.encoding.encode(token, allowed_special='all')[0]
+
+     def _convert_id_to_token(self, index: int) -> str:
+         """Converts an index (integer) into a token (str) using the vocab."""
+         return self.encoding.decode([index])
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+         """Converts a sequence of tokens (strings) into a single string."""
+         return ''.join(tokens)
+
+     def convert_ids_to_tokens(
+             self,
+             ids: Union[int, List[int]],
+             skip_special_tokens: bool = False) -> Union[str, List[str]]:
+         """Converts a single index or a sequence of indices into a token or a
+         sequence of tokens, using the vocabulary and added tokens.
+
+         Args:
+             ids (`int` or `List[int]`):
+                 The token id (or token ids) to convert to tokens.
+             skip_special_tokens (`bool`, *optional*, defaults to `False`):
+                 Whether or not to remove special tokens in the decoding.
+
+         Returns:
+             `str` or `List[str]`: The decoded token(s).
+         """
+         if isinstance(ids, int):
+             if ids in self.added_tokens_decoder:
+                 return self.added_tokens_decoder[ids]
+
+             return self._convert_id_to_token(ids)
+
+         # current_stream will collect multiple tokens, and then separately add items
+         # for each added token. This is done so that decode works properly with token ids
+         # that cannot be represented naively in utf-8.
+         tokens = []
+         current_stream = []
+         for index in ids:
+             if skip_special_tokens and index in self.all_special_ids:
+                 continue
+
+             if index in self.added_tokens_decoder:
+                 tokens.append(self.encoding.decode(current_stream))
+                 current_stream = []
+                 tokens.append(self.added_tokens_decoder[index])
+             else:
+                 current_stream.append(index)
+
+         if len(current_stream) > 0:
+             tokens.append(self.encoding.decode(current_stream))
+         return tokens
+
+     def build_inputs_with_special_tokens(
+             self,
+             token_ids_0: List[int],
+             token_ids_1: Optional[List[int]] = None) -> List[int]:
+         bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+         eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+         output = bos_token_id + token_ids_0 + eos_token_id
+
+         if token_ids_1 is not None:
+             output = output + bos_token_id + token_ids_1 + eos_token_id
+
+         return output
+
+     def get_special_tokens_mask(
+             self,
+             token_ids_0: List[int],
+             token_ids_1: Optional[List[int]] = None,
+             already_has_special_tokens: bool = False) -> List[int]:
+         """Retrieves sequence ids from a token list that has no special tokens added.
+
+         Function copied from
+         https://github.com/huggingface/transformers/blob/e3a4bd2bee212a2d0fd9f03b27fe7bfc1debe42d/src/transformers/models/gpt2/tokenization_gpt2.py#L265-L295
+
+         This method is called when adding special tokens using the
+         tokenizer `prepare_for_model` or `encode_plus` methods.
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+             already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                 Whether or not the token list is already formatted with special tokens for the model.
+
+         Returns:
+             `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+         """
+         if already_has_special_tokens:
+             return super().get_special_tokens_mask(
+                 token_ids_0=token_ids_0,
+                 token_ids_1=token_ids_1,
+                 already_has_special_tokens=True)
+
+         bos_token_id = [1] if self.add_bos_token else []
+         eos_token_id = [1] if self.add_eos_token else []
+
+         if token_ids_1 is None:
+             return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+         return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
+                 bos_token_id + ([0] * len(token_ids_1)) + eos_token_id)
+
+     def create_token_type_ids_from_sequences(
+             self,
+             token_ids_0: List[int],
+             token_ids_1: Optional[List[int]] = None) -> List[int]:
+         sep = [self.sep_token_id]
+
+         if token_ids_1 is None:
+             return len(token_ids_0 + sep) * [0]
+         return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+     def save_vocabulary(self,
+                         save_directory: str,
+                         filename_prefix: Optional[str] = None) -> Tuple[str]:
+
+         # ignore the below type to keep the original signature
+         # we are knowingly breaking the signature here, although not 100% certain
+         # it doesn't have side effects
+         # There is some code in huggingface that calls this function to get the vocab files,
+         # but it doesn't seem to access them (or at least checks for their existence
+         # before accessing them)
+         return (None, None)  # type: ignore
+
+     def sanitize_special_tokens(self) -> int:
+         """Make sure that all the special tokens attributes of the tokenizer
+         (`tokenizer.mask_token`, `tokenizer.cls_token`, etc.) are in the
+         vocabulary.
+
+         Add the missing ones to the vocabulary if needed.
+
+         Return:
+             `int`: The number of tokens added in the vocabulary during the operation.
+         """
+         actual_new_tokens = []
+         for token in self.all_special_tokens_extended:
+             encoded = self.encoding.encode(token, allowed_special='all')
+             if len(encoded) > 1:
+                 actual_new_tokens.append(token)
+
+         return self.add_tokens(actual_new_tokens, special_tokens=True)
+
+     def construct_logit_tensor(self, logprobs: Dict[str,
+                                                     float]) -> torch.Tensor:
+         """Construct tensor of shape (vocab_size,) mapping words to logprobs.
+
+         Args:
+             logprobs (Dict[str, float]): Dictionary mapping tokens to log probabilities assigned to them by the model.
+         """
+         tensor = torch.tensor([min(logprobs.values()) - 1] * (self.vocab_size))
+         for k in logprobs:
+             encoding = self(k)['input_ids']
+             idx = encoding[0]
+             tensor[idx] = logprobs[k]
+         return tensor
+
+
+ TiktokenTokenizerWrapper.register_for_auto_class()
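
The wrapper above can also be constructed directly, without going through the auto classes. A minimal sketch, assuming tiktoken is installed and `TiktokenTokenizerWrapper` is importable; the round-trip string is illustrative:

    # Build the wrapper from a tiktoken model name and round-trip a string.
    tokenizer = TiktokenTokenizerWrapper(model_name='gpt-4')

    ids = tokenizer('hello world')['input_ids']  # encoded by tiktoken
    text = tokenizer.decode(ids)                 # decoded back to text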
tokenizer_config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "add_bos_token": false,
+   "add_eos_token": false,
+   "add_prefix_space": false,
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "auto_map": {
+     "AutoTokenizer": [
+       "tiktoken.TiktokenTokenizerWrapper",
+       null
+     ]
+   },
+   "bos_token": "<|endoftext|>",
+   "clean_up_tokenization_spaces": true,
+   "encoding_name": null,
+   "eos_token": "<|endoftext|>",
+   "model_max_length": 8192,
+   "model_name": "gpt-4",
+   "pad_token": null,
+   "tokenizer_class": "TiktokenTokenizerWrapper",
+   "unk_token": "<|endoftext|>"
+ }
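
Because tiktoken.py calls `register_for_auto_class()` and this config wires the class up through `auto_map`, the tokenizer loads through the standard auto classes when remote code is trusted. `"model_name": "gpt-4"` makes tiktoken select its cl100k_base encoding, and `model_max_length` matches that model's 8192-token context window. A minimal sketch, with a placeholder repo id:

    # trust_remote_code=True lets transformers execute tiktoken.py from the
    # repo. The repo id below is a placeholder, not the actual repo name.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(
        'your-org/your-repo', trust_remote_code=True)
    print(tokenizer.vocab_size)  # size of the cl100k_base vocabulary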