fixed tokenizer for rwkv_6_v2.1

#2 opened by SupYumm
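
For anyone who wants to sanity-check the tokenizer change locally, here is a minimal round-trip sketch. The repo id below is a placeholder for this repository, and `trust_remote_code=True` is needed because the tokenizer class ships as a Python file inside the repo rather than inside `transformers` itself:

```python
# Minimal sanity check for the custom tokenizer in this repo.
# "OWNER/rwkv_6_v2.1" is a placeholder repo id; point it at this repository or a local clone.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "OWNER/rwkv_6_v2.1",     # placeholder; substitute the real repo id
    trust_remote_code=True,  # the tokenizer class is loaded from the .py file in the repo
)

text = "Hello world! 你好"
ids = tokenizer(text)["input_ids"]
print(ids)
print(tokenizer.decode(ids))  # should round-trip (possibly with a leading <s> bos token)
```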
rwkv_vocab_v20230424.txt DELETED
The diff for this file is too large to render. See raw diff
 
hf_rwkv_tokenizer.py → tokenization_rwkv5.py RENAMED
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization classes for RWKV6."""
+"""Tokenization classes for RWKV5."""
 
 import os
 import re
@@ -27,147 +27,107 @@ if TYPE_CHECKING:
 
 logger = logging.get_logger(__name__)
 
-
 VOCAB_FILES_NAMES = {
-    "vocab_file": "rwkv_vocab_v20230424.txt",
+    "vocab_file": "vocab.txt",
+}
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "ArthurZ/rwkv-5-utf": "https://huggingface.co/ArthurZ/rwkv-5-utf/blob/main/vocab.txt",
+    },
 }
 
-class TRIE:
-    __slots__ = tuple("ch,to,values,front".split(","))
-    to: list
-    values: set
-
-    def __init__(self, front=None, ch=None):
-        self.ch = ch
-        self.to = [None for ch in range(256)]
-        self.values = set()
-        self.front = front
-
-    def __repr__(self):
-        fr = self
-        ret = []
-        while fr != None:
-            if fr.ch != None:
-                ret.append(fr.ch)
-            fr = fr.front
-        return "<TRIE %s %s>" % (ret[::-1], self.values)
-
-    def add(self, key: bytes, idx: int = 0, val=None):
-        if idx == len(key):
-            if val is None:
-                val = key
-            self.values.add(val)
-            return self
-        ch = key[idx]
-        if self.to[ch] is None:
-            self.to[ch] = TRIE(front=self, ch=ch)
-        return self.to[ch].add(key, idx=idx + 1, val=val)
-
-    def find_longest(self, key: bytes, idx: int = 0):
-        u: TRIE = self
-        ch: int = key[idx]
-
-        while u.to[ch] is not None:
-            u = u.to[ch]
-            idx += 1
-            if u.values:
-                ret = idx, u, u.values
-            if idx == len(key):
-                break
-            ch = key[idx]
-        return ret
-
-
-class RWKV_TOKENIZER:
-    def __init__(self, file_name):
-        self.idx2token = {}
-        sorted = [] # must be already sorted
-        with open(file_name, "r", encoding="utf-8") as f:
-            lines = f.readlines()
-        for l in lines:
-            idx = int(l[: l.index(" ")])
-            x = eval(l[l.index(" ") : l.rindex(" ")])
-            x = x.encode("utf-8") if isinstance(x, str) else x
-            assert isinstance(x, bytes)
-
-            assert len(x) == int(l[l.rindex(" ") :])
-            sorted += [x]
-            self.idx2token[idx] = x
-
-        self.token2idx = {}
-        for k, v in self.idx2token.items():
-            self.token2idx[v] = int(k)
-
-        self.root = TRIE()
-        for t, i in self.token2idx.items():
-            _ = self.root.add(t, val=(t, i))
-
-    def encodeBytes(self, src: bytes):
-        idx: int = 0
-        tokens = []
-        while idx < len(src):
-            _idx: int = idx
-            idx, _, values = self.root.find_longest(src, idx)
-            assert idx != _idx
-            _, token = next(iter(values))
-            tokens.append(token)
-        return tokens
-
-    def decodeBytes(self, tokens):
-        return b"".join(map(lambda i: self.idx2token[i], tokens))
-
-    def encode(self, src):
-        if isinstance(src, str):
-            return [self.encodeBytes(src.encode("utf-8"))]
-        elif isinstance(src, list):
-            return [self.encodeBytes(s.encode("utf-8")) for s in src]
-
-    def decode(self, tokens):
-        return [self.decodeBytes(batch).decode("utf-8") for batch in tokens]
-        # try:
-        #     return self.decodeBytes(tokens).decode('utf-8')
-        # except:
-        #     return '\ufffd' # bad utf-8
-
-    def printTokens(self, tokens):
-        for i in tokens:
-            s = self.idx2token[i]
-            try:
-                s = s.decode("utf-8")
-            except:
-                pass
-            print(f"{repr(s)}{i}", end=" ")
-        print()
-
-
-class Rwkv6Tokenizer(PreTrainedTokenizer):
+
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a piece of text.
+    The separators are kept
+    """
+    text = text.strip()
+    if not text:
+        return []
+    tokens = re.split(b"(?= )", text)
+    return tokens
+
+
+class WordpieceTokenizer(object):
+    """Runs WordPiece tokenization."""
+
+    def __init__(self, vocab, unk_token):
+        self.vocab = vocab
+        self.unk_token = unk_token
+
+    def tokenize(self, text):
+        """
+        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
+        tokenization using the given vocabulary.
+
+        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+
+        Args:
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through *BasicTokenizer*.
+
+        Returns:
+            A list of wordpiece tokens.
+        """
+
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = bytes(chars[start:end])
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                try:
+                    cur_substr = cur_substr.decode()
+                except UnicodeDecodeError:
+                    cur_substr = str(cur_substr)
+                sub_tokens.append(cur_substr)
+                start = end
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+
+
+class Rwkv5Tokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = {"ArthurZ/rwkv-5-utf": 2048}
+
     model_input_names = ["input_ids", "attention_mask"]
 
-    def __init__(
-        self, vocab_file, bos_token="<s>", eos_token="<s>", unk_token="<s>", **kwargs
-    ):
+    def __init__(self, vocab_file, bos_token="<s>", eos_token="<s>", unk_token="<s>", **kwargs):
         if not os.path.isfile(vocab_file):
             raise ValueError(
                 f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                 " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
             )
 
-        with open(vocab_file, "r", encoding="utf-8") as reader:
+        with open(vocab_file, "r") as reader:
             tokens = reader.readlines()
+        vocab = {}
+        for index, token in enumerate(tokens):
+            token = eval(token.rstrip("\n"))
+            vocab[token] = index
 
-        if "add_bos_token" in kwargs:
-            self.add_bos_token = kwargs["add_bos_token"]
-        else:
-            self.add_bos_token = False
-        self.trie_tokenizer = RWKV_TOKENIZER(vocab_file)
-        vocab = self.trie_tokenizer.token2idx
+        self.add_bos_token = True
         self.encoder = vocab
         self.decoder = {v: k for k, v in vocab.items()}
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=str(unk_token))
         self._added_tokens_decoder = {0: AddedToken(str(bos_token))}
-        super().__init__(
-            bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs
-        )
+        super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
 
     @property
     def vocab_size(self):
@@ -179,11 +139,15 @@ class Rwkv6Tokenizer(PreTrainedTokenizer):
         return vocab
 
     def _tokenize(self, text, split_special_tokens=False):
-        # return self.wordpiece_tokenizer.tokenize(text.encode("utf-8"))
-        return self.trie_tokenizer.encode(text)[0]
+        return self.wordpiece_tokenizer.tokenize(text.encode("utf-8"))
 
     def _convert_token_to_id(self, token):
-        return token
+        """Converts a token (byte) to an id using the vocab."""
+        if token.startswith("b'\\"):
+            token = eval(token)
+        elif not isinstance(token, bytes):
+            token = token.encode("utf-8", errors="replace")
+        return self.encoder.get(token, self.unk_token_id)
 
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (byte) using the vocab."""
@@ -194,28 +158,21 @@ class Rwkv6Tokenizer(PreTrainedTokenizer):
 
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (bytes) in a single string. Additional tokens are encoded to bytes"""
-        out_string = b"".join(
-            [k.encode(errors="replace") if isinstance(k, str) else k for k in tokens]
-        ).decode("utf-8")
+        out_string = b"".join([k.encode(errors="replace") if isinstance(k, str) else k for k in tokens]).decode(
+            "utf-8"
+        )
         return out_string
 
-    def save_vocabulary(
-        self, save_directory: str, filename_prefix: Optional[str] = None
-    ) -> Tuple[str]:
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         index = 0
         if os.path.isdir(save_directory):
             vocab_file = os.path.join(
-                save_directory,
-                (filename_prefix + "-" if filename_prefix else "") + "vocab.txt",
+                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
             )
         else:
-            vocab_file = (
-                filename_prefix + "-" if filename_prefix else ""
-            ) + save_directory
-        with open(vocab_file, "w", encoding="utf-8") as writer:
-            for token, token_index in sorted(
-                self.encoder.items(), key=lambda kv: kv[1]
-            ):
+            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+        with open(vocab_file, "w") as writer:
+            for token, token_index in sorted(self.encoder.items(), key=lambda kv: kv[1]):
                 if index != token_index:
                     logger.warning(
                         f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
@@ -240,10 +197,7 @@ class Rwkv6Tokenizer(PreTrainedTokenizer):
         return output + bos_token_ids + token_ids_1
 
     def get_special_tokens_mask(
-        self,
-        token_ids_0: List[int],
-        token_ids_1: Optional[List[int]] = None,
-        already_has_special_tokens: bool = False,
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
     ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
@@ -262,16 +216,12 @@ class Rwkv6Tokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
             return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0,
-                token_ids_1=token_ids_1,
-                already_has_special_tokens=True,
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
 
         if not self.add_bos_token:
             return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0,
-                token_ids_1=token_ids_1,
-                already_has_special_tokens=False,
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=False
            )
 
         if token_ids_1 is None:

tokenizer_config.json CHANGED
@@ -1,11 +1,11 @@
 {
-  "name_or_path": "rwkv-6-tokenizer",
+  "name_or_path": "rwkv-5-tokenizer",
   "add_prefix_space": false,
-  "tokenizer_class": "Rwkv6Tokenizer",
+  "tokenizer_class": "Rwkv5Tokenizer",
   "use_fast": false,
   "auto_map": {
     "AutoTokenizer": [
-      "hf_rwkv_tokenizer.Rwkv6Tokenizer",
+      "tokenization_rwkv5.Rwkv5Tokenizer",
       null
     ]
   }

vocab.txt ADDED
The diff for this file is too large to render. See raw diff
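
Side note for anyone comparing the two implementations in the diff above: the trie-based `RWKV_TOKENIZER` and the byte-level `WordpieceTokenizer` both come down to greedy longest-match-first lookup of byte sequences against the vocabulary; they differ mainly in how the match is found (trie walk vs. shrinking a candidate window from the right, after an optional whitespace pre-split) and in how unmatched bytes are handled. A toy sketch of that matching loop, with a made-up vocabulary, just to illustrate the idea:

```python
# Toy illustration of greedy longest-match-first byte tokenization.
# The vocabulary below is made up for the example; the real one comes from the vocab file in this repo.
def greedy_longest_match(src: bytes, vocab: dict) -> list:
    """Repeatedly emit the longest vocab entry that prefixes the remaining bytes."""
    ids, idx = [], 0
    while idx < len(src):
        end = len(src)
        while end > idx and src[idx:end] not in vocab:
            end -= 1                 # shrink the candidate until it matches the vocab
        if end == idx:               # nothing matched: fall back to a single byte
            end = idx + 1
        ids.append(vocab.get(src[idx:end], -1))  # -1 stands in for an unk id here
        idx = end
    return ids


vocab = {b"un": 1, b"aff": 2, b"able": 3, b"a": 4, b"f": 5, b"b": 6, b"l": 7, b"e": 8}
print(greedy_longest_match(b"unaffable", vocab))  # -> [1, 2, 3]
```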