feat(tokenizer): expose merge ranks and special tokens for GGUF
tokenization_arcade100k.py
@@ -113,7 +113,7 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
         super().__init__(errors=errors, **kwargs)
         self._tiktoken_config = _arcade100k(vocab_file)
         self.tokenizer = tiktoken.Encoding(**self._tiktoken_config)
-
+
         # TODO: Remove this assertion
         assert (
             len(self.tokenizer._mergeable_ranks)
@@ -126,6 +126,9 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
         self.decoder.update({i: n for n, i in self.tokenizer._special_tokens.items()})
         self.eos_token = self.decoder[self.tokenizer.eot_token]
         self.pad_token = self.decoder[self.tokenizer.eot_token]
+        # Expose for convenience
+        self.mergeable_ranks = self.tokenizer._mergeable_ranks
+        self.special_tokens = self.tokenizer._special_tokens
 
     def __len__(self):
         return self.tokenizer.n_vocab
@@ -270,4 +273,4 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
             token_ids = [token_ids]
         if skip_special_tokens:
             token_ids = [i for i in token_ids if i < self.tokenizer.eot_token]
-        return self.tokenizer.decode(token_ids)
+        return self.tokenizer.decode(token_ids)
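The two attributes made public here, `mergeable_ranks` (in tiktoken, a dict mapping token bytes to their BPE merge rank, which also serves as the token id) and `special_tokens` (a dict mapping each special-token string to its id), give a GGUF conversion script everything it needs to reconstruct the vocabulary without reaching into tiktoken's private `_mergeable_ranks` and `_special_tokens` fields. Below is a minimal sketch of how a converter might consume them; the import path, vocab file name, and surrounding script are assumptions for illustration, not part of this commit.

# A minimal sketch, not part of this commit: build an id-ordered token list
# from the attributes exposed above, e.g. as input to a GGUF vocab writer.
from tokenization_arcade100k import Arcade100kTokenizer  # hypothetical import path

tok = Arcade100kTokenizer(vocab_file="arcade100k.tiktoken")  # hypothetical file name

# mergeable_ranks: dict[bytes, int] (token bytes -> rank, used as the token id)
# special_tokens:  dict[str, int]   (special-token string -> token id)
id_to_bytes = {rank: token for token, rank in tok.mergeable_ranks.items()}
id_to_bytes.update({i: s.encode("utf-8") for s, i in tok.special_tokens.items()})

# Emit tokens in id order for the converter.
tokens = [id_to_bytes[i] for i in sorted(id_to_bytes)]
print(len(tokens), len(tok))  # should match when token ids are contiguous

Reading the vocabulary through these public attributes also insulates converters from a future tiktoken release renaming the underscore-prefixed internals.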