fixed tokenizer for rwkv_6_v2.1

#2 opened by SupYumm
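
For anyone who wants to sanity-check the tokenizer change locally, here is a minimal round-trip sketch. The repo id below is a placeholder for this repository, and `trust_remote_code=True` is needed because the tokenizer class ships as a Python file inside the repo rather than inside `transformers` itself:

```python
# Minimal sanity check for the custom tokenizer in this repo.
# "OWNER/rwkv_6_v2.1" is a placeholder repo id; point it at this repository or a local clone.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "OWNER/rwkv_6_v2.1",     # placeholder; substitute the real repo id
    trust_remote_code=True,  # the tokenizer class is loaded from the .py file in the repo
)

text = "Hello world! 你好"
ids = tokenizer(text)["input_ids"]
print(ids)
print(tokenizer.decode(ids))  # should round-trip (possibly with a leading <s> bos token)
```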
rwkv_vocab_v20230424.txt DELETED
The diff for this file is too large to render. See raw diff
 
hf_rwkv_tokenizer.py → tokenization_rwkv5.py RENAMED
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization classes for RWKV6."""
+"""Tokenization classes for RWKV5."""
 
 import os
 import re
@@ -27,147 +27,107 @@ if TYPE_CHECKING:
 
 logger = logging.get_logger(__name__)
 
-
 VOCAB_FILES_NAMES = {
-    "vocab_file": "rwkv_vocab_v20230424.txt",
+    "vocab_file": "vocab.txt",
+}
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "ArthurZ/rwkv-5-utf": "https://huggingface.co/ArthurZ/rwkv-5-utf/blob/main/vocab.txt",
+    },
 }
 
-class TRIE:
-    __slots__ = tuple("ch,to,values,front".split(","))
-    to: list
-    values: set
-
-    def __init__(self, front=None, ch=None):
-        self.ch = ch
-        self.to = [None for ch in range(256)]
-        self.values = set()
-        self.front = front
-
-    def __repr__(self):
-        fr = self
-        ret = []
-        while fr != None:
-            if fr.ch != None:
-                ret.append(fr.ch)
-            fr = fr.front
-        return "<TRIE %s %s>" % (ret[::-1], self.values)
-
-    def add(self, key: bytes, idx: int = 0, val=None):
-        if idx == len(key):
-            if val is None:
-                val = key
-            self.values.add(val)
-            return self
-        ch = key[idx]
-        if self.to[ch] is None:
-            self.to[ch] = TRIE(front=self, ch=ch)
-        return self.to[ch].add(key, idx=idx + 1, val=val)
-
-    def find_longest(self, key: bytes, idx: int = 0):
-        u: TRIE = self
-        ch: int = key[idx]
-
-        while u.to[ch] is not None:
-            u = u.to[ch]
-            idx += 1
-            if u.values:
-                ret = idx, u, u.values
-            if idx == len(key):
-                break
-            ch = key[idx]
-        return ret
-
-
-class RWKV_TOKENIZER:
-    def __init__(self, file_name):
-        self.idx2token = {}
-        sorted = [] # must be already sorted
-        with open(file_name, "r", encoding="utf-8") as f:
-            lines = f.readlines()
-        for l in lines:
-            idx = int(l[: l.index(" ")])
-            x = eval(l[l.index(" ") : l.rindex(" ")])
-            x = x.encode("utf-8") if isinstance(x, str) else x
-            assert isinstance(x, bytes)
-
-            assert len(x) == int(l[l.rindex(" ") :])
-            sorted += [x]
-            self.idx2token[idx] = x
-
-        self.token2idx = {}
-        for k, v in self.idx2token.items():
-            self.token2idx[v] = int(k)
-
-        self.root = TRIE()
-        for t, i in self.token2idx.items():
-            _ = self.root.add(t, val=(t, i))
-
-    def encodeBytes(self, src: bytes):
-        idx: int = 0
-        tokens = []
-        while idx < len(src):
-            _idx: int = idx
-            idx, _, values = self.root.find_longest(src, idx)
-            assert idx != _idx
-            _, token = next(iter(values))
-            tokens.append(token)
-        return tokens
-
-    def decodeBytes(self, tokens):
-        return b"".join(map(lambda i: self.idx2token[i], tokens))
-
-    def encode(self, src):
-        if isinstance(src, str):
-            return [self.encodeBytes(src.encode("utf-8"))]
-        elif isinstance(src, list):
-            return [self.encodeBytes(s.encode("utf-8")) for s in src]
-
-    def decode(self, tokens):
-        return [self.decodeBytes(batch).decode("utf-8") for batch in tokens]
-        # try:
-        #     return self.decodeBytes(tokens).decode('utf-8')
-        # except:
-        #     return '\ufffd' # bad utf-8
-
-    def printTokens(self, tokens):
-        for i in tokens:
-            s = self.idx2token[i]
-            try:
-                s = s.decode("utf-8")
-            except:
-                pass
-            print(f"{repr(s)}{i}", end=" ")
-        print()
-
-
-class Rwkv6Tokenizer(PreTrainedTokenizer):
+
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a piece of text.
+    The separators are kept
+    """
+    text = text.strip()
+    if not text:
+        return []
+    tokens = re.split(b"(?= )", text)
+    return tokens
+
+
+class WordpieceTokenizer(object):
+    """Runs WordPiece tokenization."""
+
+    def __init__(self, vocab, unk_token):
+        self.vocab = vocab
+        self.unk_token = unk_token
+
+    def tokenize(self, text):
+        """
+        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
+        tokenization using the given vocabulary.
+
+        For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`.
+
+        Args:
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through *BasicTokenizer*.
+
+        Returns:
+            A list of wordpiece tokens.
+        """
+
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = bytes(chars[start:end])
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                try:
+                    cur_substr = cur_substr.decode()
+                except UnicodeDecodeError:
+                    cur_substr = str(cur_substr)
+                sub_tokens.append(cur_substr)
+                start = end
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+
+
+class Rwkv5Tokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = {"ArthurZ/rwkv-5-utf": 2048}
+
     model_input_names = ["input_ids", "attention_mask"]
 
-    def __init__(
-        self, vocab_file, bos_token="<s>", eos_token="<s>", unk_token="<s>", **kwargs
-    ):
+    def __init__(self, vocab_file, bos_token="<s>", eos_token="<s>", unk_token="<s>", **kwargs):
         if not os.path.isfile(vocab_file):
             raise ValueError(
                 f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                 " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
             )
 
-        with open(vocab_file, "r", encoding="utf-8") as reader:
+        with open(vocab_file, "r") as reader:
             tokens = reader.readlines()
+        vocab = {}
+        for index, token in enumerate(tokens):
+            token = eval(token.rstrip("\n"))
+            vocab[token] = index
 
-        if "add_bos_token" in kwargs:
-            self.add_bos_token = kwargs["add_bos_token"]
-        else:
-            self.add_bos_token = False
-        self.trie_tokenizer = RWKV_TOKENIZER(vocab_file)
-        vocab = self.trie_tokenizer.token2idx
+        self.add_bos_token = True
         self.encoder = vocab
         self.decoder = {v: k for k, v in vocab.items()}
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=str(unk_token))
         self._added_tokens_decoder = {0: AddedToken(str(bos_token))}
-        super().__init__(
-            bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs
-        )
+        super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
 
     @property
     def vocab_size(self):
@@ -179,11 +139,15 @@ class Rwkv6Tokenizer(PreTrainedTokenizer):
         return vocab
 
     def _tokenize(self, text, split_special_tokens=False):
-        # return self.wordpiece_tokenizer.tokenize(text.encode("utf-8"))
-        return self.trie_tokenizer.encode(text)[0]
+        return self.wordpiece_tokenizer.tokenize(text.encode("utf-8"))
 
     def _convert_token_to_id(self, token):
-        return token
+        """Converts a token (byte) to an id using the vocab."""
+        if token.startswith("b'\\"):
+            token = eval(token)
+        elif not isinstance(token, bytes):
+            token = token.encode("utf-8", errors="replace")
+        return self.encoder.get(token, self.unk_token_id)
 
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (byte) using the vocab."""
@@ -194,28 +158,21 @@ class Rwkv6Tokenizer(PreTrainedTokenizer):
 
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (bytes) in a single string. Additional tokens are encoded to bytes"""
-        out_string = b"".join(
-            [k.encode(errors="replace") if isinstance(k, str) else k for k in tokens]
-        ).decode("utf-8")
+        out_string = b"".join([k.encode(errors="replace") if isinstance(k, str) else k for k in tokens]).decode(
+            "utf-8"
+        )
         return out_string
 
-    def save_vocabulary(
-        self, save_directory: str, filename_prefix: Optional[str] = None
-    ) -> Tuple[str]:
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         index = 0
         if os.path.isdir(save_directory):
             vocab_file = os.path.join(
-                save_directory,
-                (filename_prefix + "-" if filename_prefix else "") + "vocab.txt",
+                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
             )
         else:
-            vocab_file = (
-                filename_prefix + "-" if filename_prefix else ""
-            ) + save_directory
-        with open(vocab_file, "w", encoding="utf-8") as writer:
-            for token, token_index in sorted(
-                self.encoder.items(), key=lambda kv: kv[1]
-            ):
+            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+        with open(vocab_file, "w") as writer:
+            for token, token_index in sorted(self.encoder.items(), key=lambda kv: kv[1]):
                 if index != token_index:
                     logger.warning(
                         f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
@@ -240,10 +197,7 @@ class Rwkv6Tokenizer(PreTrainedTokenizer):
         return output + bos_token_ids + token_ids_1
 
     def get_special_tokens_mask(
-        self,
-        token_ids_0: List[int],
-        token_ids_1: Optional[List[int]] = None,
-        already_has_special_tokens: bool = False,
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
     ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
@@ -262,16 +216,12 @@ class Rwkv6Tokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
             return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0,
-                token_ids_1=token_ids_1,
-                already_has_special_tokens=True,
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
 
         if not self.add_bos_token:
             return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0,
-                token_ids_1=token_ids_1,
-                already_has_special_tokens=False,
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=False
            )
 
         if token_ids_1 is None:

tokenizer_config.json CHANGED
@@ -1,11 +1,11 @@
 {
-  "name_or_path": "rwkv-6-tokenizer",
+  "name_or_path": "rwkv-5-tokenizer",
   "add_prefix_space": false,
-  "tokenizer_class": "Rwkv6Tokenizer",
+  "tokenizer_class": "Rwkv5Tokenizer",
   "use_fast": false,
   "auto_map": {
     "AutoTokenizer": [
-      "hf_rwkv_tokenizer.Rwkv6Tokenizer",
+      "tokenization_rwkv5.Rwkv5Tokenizer",
       null
     ]
   }

vocab.txt ADDED
The diff for this file is too large to render. See raw diff
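
Side note for anyone comparing the two implementations in the diff above: the trie-based `RWKV_TOKENIZER` and the byte-level `WordpieceTokenizer` both come down to greedy longest-match-first lookup of byte sequences against the vocabulary; they differ mainly in how the match is found (trie walk vs. shrinking a candidate window from the right, after an optional whitespace pre-split) and in how unmatched bytes are handled. A toy sketch of that matching loop, with a made-up vocabulary, just to illustrate the idea:

```python
# Toy illustration of greedy longest-match-first byte tokenization.
# The vocabulary below is made up for the example; the real one comes from the vocab file in this repo.
def greedy_longest_match(src: bytes, vocab: dict) -> list:
    """Repeatedly emit the longest vocab entry that prefixes the remaining bytes."""
    ids, idx = [], 0
    while idx < len(src):
        end = len(src)
        while end > idx and src[idx:end] not in vocab:
            end -= 1                 # shrink the candidate until it matches the vocab
        if end == idx:               # nothing matched: fall back to a single byte
            end = idx + 1
        ids.append(vocab.get(src[idx:end], -1))  # -1 stands in for an unk id here
        idx = end
    return ids


vocab = {b"un": 1, b"aff": 2, b"able": 3, b"a": 4, b"f": 5, b"b": 6, b"l": 7, b"e": 8}
print(greedy_longest_match(b"unaffable", vocab))  # -> [1, 2, 3]
```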