Arnav0400 committed on
Commit
3ea6ec1
·
1 Parent(s): e986386

add default tokenization scripts

Browse files
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenization_llama.py ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ """Tokenization classes for LLaMA."""
22
+ import os
23
+ from shutil import copyfile
24
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
25
+
26
+ import sentencepiece as spm
27
+
28
+ from ...tokenization_utils import AddedToken, PreTrainedTokenizer
29
+ from ...utils import logging
30
+
31
+
32
+ if TYPE_CHECKING:
33
+ from ...pipelines.conversational import Conversation
34
+ from ...tokenization_utils_base import TextInput
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
# File name `save_vocabulary` writes the SentencePiece model under; also exposed on the class
# as `vocab_files_names`.
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

# URLs of the tokenizer files for the listed checkpoint; exposed on the class as
# `pretrained_vocab_files_map`.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
    },
    "tokenizer_file": {
        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
    },
}
# Exposed on the class as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "hf-internal-testing/llama-tokenizer": 2048,
}
# Marker character that `tokenize` prepends and `_tokenize` strips in non-legacy mode.
SPIECE_UNDERLINE = "▁"

# Delimiters used by `_build_conversation_input_ids`: instruction wrapper and system-prompt wrapper.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

# fmt: off
# System prompt that `_build_conversation_input_ids` inserts when the first user message has none.
# NOTE(review): the line after "Please ensure\" must start with a single space so the joined text
# reads "ensure that"; the diff view this was recovered from collapses leading whitespace, so
# confirm against the raw file before relying on the exact bytes.
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \
answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
 that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
correct. If you don't know the answer to a question, please don't share false information."""
# fmt: on
64
+
65
+
66
class LlamaTokenizer(PreTrainedTokenizer):
    """
    Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
    no padding token in the original model. The vocabulary is a SentencePiece model loaded from `vocab_file`.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        unk_token (`str` or `AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        bos_token (`str` or `AddedToken`, *optional*, defaults to `"<s>"`):
            The beginning-of-sequence token.
        eos_token (`str` or `AddedToken`, *optional*, defaults to `"</s>"`):
            The end-of-sequence token.
        pad_token (`str` or `AddedToken`, *optional*):
            The padding token. Left unset by default.
        sp_model_kwargs (`dict`, *optional*):
            Keyword arguments passed to the `SentencePieceProcessor` constructor.
        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether `build_inputs_with_special_tokens` prepends the BOS id to each sequence.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether `build_inputs_with_special_tokens` appends the EOS id to each sequence.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Forwarded unchanged to the base-class initializer.
        legacy (`bool`, *optional*, defaults to `True`):
            Whether or not the `legacy` behaviour of the tokenizer should be used. Legacy is before the merge of #24622
            which includes fixes to properly handle tokens that appear after special tokens. When left as `None`, a
            warning is logged and `True` is used. A simple example (shown with `T5Tokenizer`, from which the
            tokenization methods are copied):

            - `legacy=True`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=True)
            >>> tokenizer.encode("Hello <extra_id_0>.")
            [8774, 32099, 3, 5, 1]
            ```
            - `legacy=False`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
            >>> tokenizer.encode("Hello <extra_id_0>.")  # the extra space `[3]` is no longer here
            [8774, 32099, 5, 1]
            ```
            Check out the pull request and the issue [here](https://github.com/huggingface/transformers/pull/24565) for
            more details.

    """

    # Class-level metadata taken from the module constants above.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # No "token_type_ids" entry is listed here.
    model_input_names = ["input_ids", "attention_mask"]
103
+
104
def __init__(
    self,
    vocab_file,
    unk_token="<unk>",
    bos_token="<s>",
    eos_token="</s>",
    pad_token=None,
    sp_model_kwargs: Optional[Dict[str, Any]] = None,
    add_bos_token=True,
    add_eos_token=False,
    clean_up_tokenization_spaces=False,
    legacy=None,
    **kwargs,
):
    """
    Load the SentencePiece model and initialise the tokenizer.

    Args:
        vocab_file (`str`):
            Path to the SentencePiece model file.
        unk_token, bos_token, eos_token, pad_token (`str` or `AddedToken`, *optional*):
            Special tokens. Plain strings are wrapped in `AddedToken(..., lstrip=False, rstrip=False)`.
        sp_model_kwargs (`dict`, *optional*):
            Keyword arguments for the `SentencePieceProcessor` constructor. `None` means no arguments.
        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether BOS is prepended when building model inputs.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether EOS is appended when building model inputs.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Forwarded unchanged to the base-class initializer.
        legacy (`bool`, *optional*):
            Tokenization mode. `None` logs a one-time warning and behaves as `True`.
        **kwargs:
            Forwarded to the base-class initializer.
    """
    self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
    bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
    eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
    unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
    pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token

    if legacy is None:
        logger.warning_once(
            f"You are using the default legacy behaviour of the {self.__class__}. This means that tokens that come after special tokens will not be properly handled. We recommend you to"
            " read the related pull request available at https://github.com/huggingface/transformers/pull/24565, and set the legacy attribute accordingly."
        )

    # Set up all tokenizer state BEFORE calling the base-class initializer, so that
    # `vocab_size`, `get_vocab`, `_convert_token_to_id`, ... already work if the base class
    # touches the vocabulary during its own __init__ (reported for newer transformers
    # releases; confirm against the installed version).
    self.legacy = True if legacy is None else legacy
    self.vocab_file = vocab_file
    self.add_bos_token = add_bos_token
    self.add_eos_token = add_eos_token
    self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    self.sp_model.Load(vocab_file)

    super().__init__(
        bos_token=bos_token,
        eos_token=eos_token,
        unk_token=unk_token,
        pad_token=pad_token,
        add_bos_token=add_bos_token,
        add_eos_token=add_eos_token,
        sp_model_kwargs=self.sp_model_kwargs,
        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
        # The caller's original value (possibly None) is forwarded, exactly as before.
        legacy=legacy,
        **kwargs,
    )
148
+
149
def __getstate__(self):
    """
    Return a picklable snapshot of the instance.

    The live SentencePiece processor is replaced by `None` and its serialized model proto is
    stored under `"sp_model_proto"` so `__setstate__` can rebuild it.
    """
    snapshot = dict(self.__dict__)
    snapshot["sp_model"] = None
    snapshot["sp_model_proto"] = self.sp_model.serialized_model_proto()
    return snapshot
154
+
155
def __setstate__(self, d):
    """
    Restore the instance from a snapshot produced by `__getstate__`.

    `d` becomes the instance `__dict__`; a fresh SentencePiece processor is then created with
    the stored `sp_model_kwargs` and loaded from the stored `sp_model_proto`.
    """
    self.__dict__ = d
    processor_kwargs = self.sp_model_kwargs
    self.sp_model = spm.SentencePieceProcessor(**processor_kwargs)
    serialized_proto = self.sp_model_proto
    self.sp_model.LoadFromSerializedProto(serialized_proto)
159
+
160
@property
def vocab_size(self):
    """Number of pieces in the loaded SentencePiece model."""
    piece_count = self.sp_model.get_piece_size()
    return piece_count
164
+
165
def get_vocab(self):
    """
    Return the vocabulary as a `token -> id` dict.

    Ids `0 .. vocab_size - 1` are mapped through `convert_ids_to_tokens`; the entries of
    `added_tokens_encoder` are merged in afterwards and win on key collisions.
    """
    ids = range(self.vocab_size)
    vocab = dict(zip(map(self.convert_ids_to_tokens, ids), ids))
    vocab.update(self.added_tokens_encoder)
    return vocab
170
+
171
# Adapted from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
    """
    Tokenize `text` through the base-class `tokenize`.

    In legacy mode the text is passed through untouched. Otherwise every SPIECE_UNDERLINE in
    the input is replaced by a space and a single SPIECE_UNDERLINE is prepended, so that the
    marker only ever appears at the very beginning of the text.
    """
    if self.legacy:
        return super().tokenize(text, **kwargs)

    without_markers = text.replace(SPIECE_UNDERLINE, " ")
    return super().tokenize(SPIECE_UNDERLINE + without_markers, **kwargs)
178
+
179
# Adapted from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
def _tokenize(self, text, **kwargs):
    """
    Returns a tokenized string.

    Since the sentencepiece internal model always adds a SPIECE_UNDERLINE at the beginning of the provided text,
    we need to remove it by hand when the current text is a subsequence. This happens whenever the `self.tokenize`
    function is called with special tokens: the input is split on the special tokens, and each subsequence is
    passed to `_tokenize`. Thus if a subsequence did not start with a `" "` or SPIECE_UNDERLINE, we have to remove
    the extra `SPIECE_UNDERLINE` prepended.

    Args:
        text (`str`):
            The text (or sub-text between special tokens) to split into pieces.

    Returns:
        `List[str]`: The SentencePiece pieces. An empty list is returned unchanged when the model produces no
        pieces (e.g. for empty text) instead of raising `IndexError`.
    """
    if self.legacy:
        # Legacy mode: hand the text to SentencePiece as-is.
        return self.sp_model.encode(text, out_type=str)

    # A leading SPIECE_UNDERLINE (added by `tokenize`) marks the true start of the input.
    is_first = text.startswith(SPIECE_UNDERLINE)
    if is_first:
        text = text[1:]

    tokens = self.sp_model.encode(text, out_type=str)

    # `tokens` may be empty, so check it before peeking at the first piece.
    if tokens and not is_first and not text.startswith(" ") and tokens[0].startswith(SPIECE_UNDERLINE):
        # Drop the marker SentencePiece added; drop the whole piece if it was only the marker.
        tokens = ([tokens[0][1:]] if len(tokens[0]) > 1 else []) + tokens[1:]
    return tokens
200
+
201
def _convert_token_to_id(self, token):
    """Look up the id of a token (str) in the SentencePiece model."""
    token_id = self.sp_model.piece_to_id(token)
    return token_id
204
+
205
def _convert_id_to_token(self, index):
    """Look up the token (str) for an index (integer) in the SentencePiece model."""
    return self.sp_model.IdToPiece(index)
209
+
210
def convert_tokens_to_string(self, tokens):
    """
    Converts a sequence of tokens (string) in a single string.

    Runs of ordinary tokens are decoded with the SentencePiece model. Special tokens are copied
    verbatim and never passed to the decoder. A single space is emitted before a special token
    unless it is the first token or directly follows another special token.

    Args:
        tokens (`List[str]`): Tokens to join.

    Returns:
        `str`: The decoded text.
    """
    # Evaluate the special-token list once instead of once per token, and use a set for lookups.
    special_tokens = set(self.all_special_tokens)
    pieces = []
    current_sub_tokens = []
    prev_is_special = False
    for i, token in enumerate(tokens):
        # make sure that special tokens are not decoded using sentencepiece model
        if token in special_tokens:
            if not prev_is_special and i != 0:
                pieces.append(" ")
            pieces.append(self.sp_model.decode(current_sub_tokens))
            pieces.append(token)
            prev_is_special = True
            current_sub_tokens = []
        else:
            current_sub_tokens.append(token)
            prev_is_special = False
    # Flush whatever ordinary tokens remain after the last special token.
    pieces.append(self.sp_model.decode(current_sub_tokens))
    return "".join(pieces)
228
+
229
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Write the SentencePiece model file into `save_directory`.

    Args:
        save_directory (`str`):
            Existing directory that receives the file.
        filename_prefix (`str`, *optional*):
            When given, the file name becomes `"<prefix>-" + VOCAB_FILES_NAMES["vocab_file"]`.

    Returns:
        `Tuple(str)`: One-element tuple with the path of the vocabulary file, or `None` (after logging an error)
        when `save_directory` is not a directory.
    """
    if not os.path.isdir(save_directory):
        logger.error(f"Vocabulary path ({save_directory}) should be a directory")
        return

    prefix = filename_prefix + "-" if filename_prefix else ""
    out_vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])

    if not os.path.isfile(self.vocab_file):
        # The original file is gone: dump the in-memory model instead.
        with open(out_vocab_file, "wb") as fi:
            fi.write(self.sp_model.serialized_model_proto())
    elif os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
        # Source exists and is not already the destination: plain file copy.
        copyfile(self.vocab_file, out_vocab_file)

    return (out_vocab_file,)
255
+
256
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """
    Wrap one or two id sequences with the configured special tokens.

    Each sequence is preceded by the BOS id when `add_bos_token` is set and followed by the EOS id
    when `add_eos_token` is set; a second sequence is appended after the first, wrapped the same way.

    Args:
        token_ids_0 (`List[int]`): First sequence.
        token_ids_1 (`List[int]`, *optional*): Second sequence.

    Returns:
        `List[int]`: The combined ids.
    """
    prefix = [self.bos_token_id] if self.add_bos_token else []
    suffix = [self.eos_token_id] if self.add_eos_token else []

    wrapped = prefix + token_ids_0 + suffix
    if token_ids_1 is None:
        return wrapped
    return wrapped + prefix + token_ids_1 + suffix
266
+
267
def get_special_tokens_mask(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
    """
    Build the special-token mask matching `build_inputs_with_special_tokens`.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.
        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not the token list is already formatted with special tokens for the model. When `True`
            the call is delegated to the base class.

    Returns:
        `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
    """
    if already_has_special_tokens:
        return super().get_special_tokens_mask(
            token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
        )

    bos_mask = [1] if self.add_bos_token else []
    eos_mask = [1] if self.add_eos_token else []

    mask = bos_mask + [0] * len(token_ids_0) + eos_mask
    if token_ids_1 is not None:
        mask = mask + bos_mask + [0] * len(token_ids_1) + eos_mask
    return mask
303
+
304
def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Create the token-type ids for one sequence or a pair of sequences.

    Every position of the first sequence (including its BOS/EOS, when enabled) gets `0`; every position of the
    second sequence (again including its BOS/EOS) gets `1`:

    ```
    0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
    | first sequence    | second sequence |
    ```

    If `token_ids_1` is `None`, only the first portion of the mask (0s) is returned.

    Args:
        token_ids_0 (`List[int]`):
            List of ids.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
    """
    prefix = [self.bos_token_id] if self.add_bos_token else []
    suffix = [self.eos_token_id] if self.add_eos_token else []

    first_length = len(prefix + token_ids_0 + suffix)
    if token_ids_1 is None:
        return [0] * first_length

    second_length = len(prefix + token_ids_1 + suffix)
    return [0] * first_length + [1] * second_length
336
+
337
def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
    r"""Builds the input ids for a conversation.
    This is the format used in the provided examples. System prompts should be manually added at the beginning of
    the conversation. If no system prompt is given, the `DEFAULT_SYSTEM_PROMPT` will be used.
    ```
    <bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos>
    <bos>[INST] Prompt [/INST] Answer <eos>
    <bos>[INST] Prompt [/INST]
    ```

    If you want to use your own system prompt, make sure to use both `B_SYS` and `E_SYS`, as in the following:
    ```python
    >>> from transformers import Conversation

    >>> Conversation(
    ...     "<<SYS>>\n Only answer with emojis, and charades\n<</SYS>>\n\nHow can I build a house in 10 steps?"
    ... )  # doctest: +IGNORE_RESULT
    ```

    Note that the default system prompt is written back into `conversation` (its first past user input, or its
    new user input when there is no history).

    Args:
        conversation (`Conversation`):
            Conversation to build input ids for.
    Returns:
        `List[int]`:
            Input ids for the conversation.
    Raises:
        `ValueError`: If the conversation has no user message, or if the messages do not strictly alternate
        user/assistant starting with the user.
    """
    if len(conversation.past_user_inputs) > 0:
        if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
            conversation.past_user_inputs[0] = (
                B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
            )
    elif conversation.new_user_input:
        if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
            conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
    else:
        raise ValueError("Last message must be from user")

    dialogue = list(conversation.iter_texts())
    # Even positions must be user turns, odd positions assistant turns.
    if not all(is_user for is_user, _ in dialogue[::2]) or not all(
        not is_user for is_user, _ in dialogue[1::2]
    ):
        raise ValueError(
            "The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
        )

    dialog_tokens: List[int] = []
    # Completed (prompt, answer) exchanges: <bos> ... <eos> each. Extending in place keeps this linear.
    for prompt, answer in zip(dialogue[::2], dialogue[1::2]):
        dialog_tokens.append(self.bos_token_id)
        dialog_tokens.extend(
            self.encode(
                f"{B_INST} {(prompt[1]).strip()} {E_INST} {(answer[1]).strip()} ", add_special_tokens=False
            )
        )
        dialog_tokens.append(self.eos_token_id)
    # The pending user prompt: <bos> only, the answer is still to be generated.
    dialog_tokens.append(self.bos_token_id)
    dialog_tokens.extend(
        self.encode(f"{B_INST} {(dialogue[-1][1]).strip()} {E_INST}", add_special_tokens=False)
    )
    return dialog_tokens
tokenization_llama_fast.py ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ """Tokenization classes for LLaMA."""
22
+ import os
23
+ from shutil import copyfile
24
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
25
+
26
+ import sentencepiece as spm
27
+
28
+ from ...tokenization_utils import AddedToken, PreTrainedTokenizer
29
+ from ...utils import logging
30
+
31
+
32
+ if TYPE_CHECKING:
33
+ from ...pipelines.conversational import Conversation
34
+ from ...tokenization_utils_base import TextInput
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
# File name `save_vocabulary` writes the SentencePiece model under; also exposed on the class
# as `vocab_files_names`.
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

# URLs of the tokenizer files for the listed checkpoint; exposed on the class as
# `pretrained_vocab_files_map`.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
    },
    "tokenizer_file": {
        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
    },
}
# Exposed on the class as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "hf-internal-testing/llama-tokenizer": 2048,
}
# Marker character that `tokenize` prepends and `_tokenize` strips in non-legacy mode.
SPIECE_UNDERLINE = "▁"

# Delimiters used by `_build_conversation_input_ids`: instruction wrapper and system-prompt wrapper.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

# fmt: off
# System prompt that `_build_conversation_input_ids` inserts when the first user message has none.
# NOTE(review): the line after "Please ensure\" must start with a single space so the joined text
# reads "ensure that"; the diff view this was recovered from collapses leading whitespace, so
# confirm against the raw file before relying on the exact bytes.
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \
answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
 that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
correct. If you don't know the answer to a question, please don't share false information."""
# fmt: on
64
+
65
+
66
class LlamaTokenizer(PreTrainedTokenizer):
    """
    Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
    no padding token in the original model. The vocabulary is a SentencePiece model loaded from `vocab_file`.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        unk_token (`str` or `AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        bos_token (`str` or `AddedToken`, *optional*, defaults to `"<s>"`):
            The beginning-of-sequence token.
        eos_token (`str` or `AddedToken`, *optional*, defaults to `"</s>"`):
            The end-of-sequence token.
        pad_token (`str` or `AddedToken`, *optional*):
            The padding token. Left unset by default.
        sp_model_kwargs (`dict`, *optional*):
            Keyword arguments passed to the `SentencePieceProcessor` constructor.
        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether `build_inputs_with_special_tokens` prepends the BOS id to each sequence.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether `build_inputs_with_special_tokens` appends the EOS id to each sequence.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Forwarded unchanged to the base-class initializer.
        legacy (`bool`, *optional*, defaults to `True`):
            Whether or not the `legacy` behaviour of the tokenizer should be used. Legacy is before the merge of #24622
            which includes fixes to properly handle tokens that appear after special tokens. When left as `None`, a
            warning is logged and `True` is used. A simple example (shown with `T5Tokenizer`, from which the
            tokenization methods are copied):

            - `legacy=True`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=True)
            >>> tokenizer.encode("Hello <extra_id_0>.")
            [8774, 32099, 3, 5, 1]
            ```
            - `legacy=False`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
            >>> tokenizer.encode("Hello <extra_id_0>.")  # the extra space `[3]` is no longer here
            [8774, 32099, 5, 1]
            ```
            Check out the pull request and the issue [here](https://github.com/huggingface/transformers/pull/24565) for
            more details.

    """

    # Class-level metadata taken from the module constants above.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # No "token_type_ids" entry is listed here.
    model_input_names = ["input_ids", "attention_mask"]
103
+
104
def __init__(
    self,
    vocab_file,
    unk_token="<unk>",
    bos_token="<s>",
    eos_token="</s>",
    pad_token=None,
    sp_model_kwargs: Optional[Dict[str, Any]] = None,
    add_bos_token=True,
    add_eos_token=False,
    clean_up_tokenization_spaces=False,
    legacy=None,
    **kwargs,
):
    """
    Load the SentencePiece model and initialise the tokenizer.

    Args:
        vocab_file (`str`):
            Path to the SentencePiece model file.
        unk_token, bos_token, eos_token, pad_token (`str` or `AddedToken`, *optional*):
            Special tokens. Plain strings are wrapped in `AddedToken(..., lstrip=False, rstrip=False)`.
        sp_model_kwargs (`dict`, *optional*):
            Keyword arguments for the `SentencePieceProcessor` constructor. `None` means no arguments.
        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether BOS is prepended when building model inputs.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether EOS is appended when building model inputs.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Forwarded unchanged to the base-class initializer.
        legacy (`bool`, *optional*):
            Tokenization mode. `None` logs a one-time warning and behaves as `True`.
        **kwargs:
            Forwarded to the base-class initializer.
    """
    self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
    bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
    eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
    unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
    pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token

    if legacy is None:
        logger.warning_once(
            f"You are using the default legacy behaviour of the {self.__class__}. This means that tokens that come after special tokens will not be properly handled. We recommend you to"
            " read the related pull request available at https://github.com/huggingface/transformers/pull/24565, and set the legacy attribute accordingly."
        )

    # Set up all tokenizer state BEFORE calling the base-class initializer, so that
    # `vocab_size`, `get_vocab`, `_convert_token_to_id`, ... already work if the base class
    # touches the vocabulary during its own __init__ (reported for newer transformers
    # releases; confirm against the installed version).
    self.legacy = True if legacy is None else legacy
    self.vocab_file = vocab_file
    self.add_bos_token = add_bos_token
    self.add_eos_token = add_eos_token
    self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    self.sp_model.Load(vocab_file)

    super().__init__(
        bos_token=bos_token,
        eos_token=eos_token,
        unk_token=unk_token,
        pad_token=pad_token,
        add_bos_token=add_bos_token,
        add_eos_token=add_eos_token,
        sp_model_kwargs=self.sp_model_kwargs,
        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
        # The caller's original value (possibly None) is forwarded, exactly as before.
        legacy=legacy,
        **kwargs,
    )
148
+
149
def __getstate__(self):
    """
    Return a picklable snapshot of the instance.

    The live SentencePiece processor is replaced by `None` and its serialized model proto is
    stored under `"sp_model_proto"` so `__setstate__` can rebuild it.
    """
    snapshot = dict(self.__dict__)
    snapshot["sp_model"] = None
    snapshot["sp_model_proto"] = self.sp_model.serialized_model_proto()
    return snapshot
154
+
155
def __setstate__(self, d):
    """
    Restore the instance from a snapshot produced by `__getstate__`.

    `d` becomes the instance `__dict__`; a fresh SentencePiece processor is then created with
    the stored `sp_model_kwargs` and loaded from the stored `sp_model_proto`.
    """
    self.__dict__ = d
    processor_kwargs = self.sp_model_kwargs
    self.sp_model = spm.SentencePieceProcessor(**processor_kwargs)
    serialized_proto = self.sp_model_proto
    self.sp_model.LoadFromSerializedProto(serialized_proto)
159
+
160
@property
def vocab_size(self):
    """Number of pieces in the loaded SentencePiece model."""
    piece_count = self.sp_model.get_piece_size()
    return piece_count
164
+
165
def get_vocab(self):
    """
    Return the vocabulary as a `token -> id` dict.

    Ids `0 .. vocab_size - 1` are mapped through `convert_ids_to_tokens`; the entries of
    `added_tokens_encoder` are merged in afterwards and win on key collisions.
    """
    ids = range(self.vocab_size)
    vocab = dict(zip(map(self.convert_ids_to_tokens, ids), ids))
    vocab.update(self.added_tokens_encoder)
    return vocab
170
+
171
# Adapted from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
    """
    Tokenize `text` through the base-class `tokenize`.

    In legacy mode the text is passed through untouched. Otherwise every SPIECE_UNDERLINE in
    the input is replaced by a space and a single SPIECE_UNDERLINE is prepended, so that the
    marker only ever appears at the very beginning of the text.
    """
    if self.legacy:
        return super().tokenize(text, **kwargs)

    without_markers = text.replace(SPIECE_UNDERLINE, " ")
    return super().tokenize(SPIECE_UNDERLINE + without_markers, **kwargs)
178
+
179
# Adapted from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
def _tokenize(self, text, **kwargs):
    """
    Returns a tokenized string.

    Since the sentencepiece internal model always adds a SPIECE_UNDERLINE at the beginning of the provided text,
    we need to remove it by hand when the current text is a subsequence. This happens whenever the `self.tokenize`
    function is called with special tokens: the input is split on the special tokens, and each subsequence is
    passed to `_tokenize`. Thus if a subsequence did not start with a `" "` or SPIECE_UNDERLINE, we have to remove
    the extra `SPIECE_UNDERLINE` prepended.

    Args:
        text (`str`):
            The text (or sub-text between special tokens) to split into pieces.

    Returns:
        `List[str]`: The SentencePiece pieces. An empty list is returned unchanged when the model produces no
        pieces (e.g. for empty text) instead of raising `IndexError`.
    """
    if self.legacy:
        # Legacy mode: hand the text to SentencePiece as-is.
        return self.sp_model.encode(text, out_type=str)

    # A leading SPIECE_UNDERLINE (added by `tokenize`) marks the true start of the input.
    is_first = text.startswith(SPIECE_UNDERLINE)
    if is_first:
        text = text[1:]

    tokens = self.sp_model.encode(text, out_type=str)

    # `tokens` may be empty, so check it before peeking at the first piece.
    if tokens and not is_first and not text.startswith(" ") and tokens[0].startswith(SPIECE_UNDERLINE):
        # Drop the marker SentencePiece added; drop the whole piece if it was only the marker.
        tokens = ([tokens[0][1:]] if len(tokens[0]) > 1 else []) + tokens[1:]
    return tokens
200
+
201
def _convert_token_to_id(self, token):
    """Look up the id of a token (str) in the SentencePiece model."""
    token_id = self.sp_model.piece_to_id(token)
    return token_id
204
+
205
def _convert_id_to_token(self, index):
    """Look up the token (str) for an index (integer) in the SentencePiece model."""
    return self.sp_model.IdToPiece(index)
209
+
210
def convert_tokens_to_string(self, tokens):
    """
    Converts a sequence of tokens (string) in a single string.

    Runs of ordinary tokens are decoded with the SentencePiece model. Special tokens are copied
    verbatim and never passed to the decoder. A single space is emitted before a special token
    unless it is the first token or directly follows another special token.

    Args:
        tokens (`List[str]`): Tokens to join.

    Returns:
        `str`: The decoded text.
    """
    # Evaluate the special-token list once instead of once per token, and use a set for lookups.
    special_tokens = set(self.all_special_tokens)
    pieces = []
    current_sub_tokens = []
    prev_is_special = False
    for i, token in enumerate(tokens):
        # make sure that special tokens are not decoded using sentencepiece model
        if token in special_tokens:
            if not prev_is_special and i != 0:
                pieces.append(" ")
            pieces.append(self.sp_model.decode(current_sub_tokens))
            pieces.append(token)
            prev_is_special = True
            current_sub_tokens = []
        else:
            current_sub_tokens.append(token)
            prev_is_special = False
    # Flush whatever ordinary tokens remain after the last special token.
    pieces.append(self.sp_model.decode(current_sub_tokens))
    return "".join(pieces)
228
+
229
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Write the SentencePiece model file into `save_directory`.

    Args:
        save_directory (`str`):
            Existing directory that receives the file.
        filename_prefix (`str`, *optional*):
            When given, the file name becomes `"<prefix>-" + VOCAB_FILES_NAMES["vocab_file"]`.

    Returns:
        `Tuple(str)`: One-element tuple with the path of the vocabulary file, or `None` (after logging an error)
        when `save_directory` is not a directory.
    """
    if not os.path.isdir(save_directory):
        logger.error(f"Vocabulary path ({save_directory}) should be a directory")
        return

    prefix = filename_prefix + "-" if filename_prefix else ""
    out_vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])

    if not os.path.isfile(self.vocab_file):
        # The original file is gone: dump the in-memory model instead.
        with open(out_vocab_file, "wb") as fi:
            fi.write(self.sp_model.serialized_model_proto())
    elif os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
        # Source exists and is not already the destination: plain file copy.
        copyfile(self.vocab_file, out_vocab_file)

    return (out_vocab_file,)
255
+
256
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """
    Wrap one or two id sequences with the configured special tokens.

    Each sequence is preceded by the BOS id when `add_bos_token` is set and followed by the EOS id
    when `add_eos_token` is set; a second sequence is appended after the first, wrapped the same way.

    Args:
        token_ids_0 (`List[int]`): First sequence.
        token_ids_1 (`List[int]`, *optional*): Second sequence.

    Returns:
        `List[int]`: The combined ids.
    """
    prefix = [self.bos_token_id] if self.add_bos_token else []
    suffix = [self.eos_token_id] if self.add_eos_token else []

    wrapped = prefix + token_ids_0 + suffix
    if token_ids_1 is None:
        return wrapped
    return wrapped + prefix + token_ids_1 + suffix
266
+
267
def get_special_tokens_mask(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
    """
    Build the special-token mask matching `build_inputs_with_special_tokens`.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.
        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not the token list is already formatted with special tokens for the model. When `True`
            the call is delegated to the base class.

    Returns:
        `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
    """
    if already_has_special_tokens:
        return super().get_special_tokens_mask(
            token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
        )

    bos_mask = [1] if self.add_bos_token else []
    eos_mask = [1] if self.add_eos_token else []

    mask = bos_mask + [0] * len(token_ids_0) + eos_mask
    if token_ids_1 is not None:
        mask = mask + bos_mask + [0] * len(token_ids_1) + eos_mask
    return mask
303
+
304
def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Create the token-type ids for one sequence or a pair of sequences.

    Every position of the first sequence (including its BOS/EOS, when enabled) gets `0`; every position of the
    second sequence (again including its BOS/EOS) gets `1`:

    ```
    0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
    | first sequence    | second sequence |
    ```

    If `token_ids_1` is `None`, only the first portion of the mask (0s) is returned.

    Args:
        token_ids_0 (`List[int]`):
            List of ids.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
    """
    prefix = [self.bos_token_id] if self.add_bos_token else []
    suffix = [self.eos_token_id] if self.add_eos_token else []

    first_length = len(prefix + token_ids_0 + suffix)
    if token_ids_1 is None:
        return [0] * first_length

    second_length = len(prefix + token_ids_1 + suffix)
    return [0] * first_length + [1] * second_length
336
+
337
def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
    r"""Builds the input ids for a conversation.
    This is the format used in the provided examples. System prompts should be manually added at the beginning of
    the conversation. If no system prompt is given, the `DEFAULT_SYSTEM_PROMPT` will be used.
    ```
    <bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos>
    <bos>[INST] Prompt [/INST] Answer <eos>
    <bos>[INST] Prompt [/INST]
    ```

    If you want to use your own system prompt, make sure to use both `B_SYS` and `E_SYS`, as in the following:
    ```python
    >>> from transformers import Conversation

    >>> Conversation(
    ...     "<<SYS>>\n Only answer with emojis, and charades\n<</SYS>>\n\nHow can I build a house in 10 steps?"
    ... )  # doctest: +IGNORE_RESULT
    ```

    Note that the default system prompt is written back into `conversation` (its first past user input, or its
    new user input when there is no history).

    Args:
        conversation (`Conversation`):
            Conversation to build input ids for.
    Returns:
        `List[int]`:
            Input ids for the conversation.
    Raises:
        `ValueError`: If the conversation has no user message, or if the messages do not strictly alternate
        user/assistant starting with the user.
    """
    if len(conversation.past_user_inputs) > 0:
        if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
            conversation.past_user_inputs[0] = (
                B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
            )
    elif conversation.new_user_input:
        if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
            conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
    else:
        raise ValueError("Last message must be from user")

    dialogue = list(conversation.iter_texts())
    # Even positions must be user turns, odd positions assistant turns.
    if not all(is_user for is_user, _ in dialogue[::2]) or not all(
        not is_user for is_user, _ in dialogue[1::2]
    ):
        raise ValueError(
            "The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
        )

    dialog_tokens: List[int] = []
    # Completed (prompt, answer) exchanges: <bos> ... <eos> each. Extending in place keeps this linear.
    for prompt, answer in zip(dialogue[::2], dialogue[1::2]):
        dialog_tokens.append(self.bos_token_id)
        dialog_tokens.extend(
            self.encode(
                f"{B_INST} {(prompt[1]).strip()} {E_INST} {(answer[1]).strip()} ", add_special_tokens=False
            )
        )
        dialog_tokens.append(self.eos_token_id)
    # The pending user prompt: <bos> only, the answer is still to be generated.
    dialog_tokens.append(self.bos_token_id)
    dialog_tokens.extend(
        self.encode(f"{B_INST} {(dialogue[-1][1]).strip()} {E_INST}", add_special_tokens=False)
    )
    return dialog_tokens
tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "bos_token": {
5
+ "__type": "AddedToken",
6
+ "content": "<s>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "clean_up_tokenization_spaces": false,
13
+ "eos_token": {
14
+ "__type": "AddedToken",
15
+ "content": "</s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
21
+ "model_max_length": 2048,
22
+ "pad_token": null,
23
+ "sp_model_kwargs": {},
24
+ "tokenizer_class": "LlamaTokenizer",
25
+ "unk_token": {
26
+ "__type": "AddedToken",
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": true,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }