oweller2
committed on
Commit
•
6d20d8a
1
Parent(s):
3608e05
update
Browse files
- tokenizer.py +2 -13
tokenizer.py
CHANGED
@@ -1,18 +1,7 @@
|
|
1 |
-
from transformers import PreTrainedTokenizer, AutoTokenizer
|
2 |
-
|
3 |
-
class ModernDecoderBERTTokenizer(PreTrainedTokenizer):
|
4 |
-
|
5 |
-
def __init__(self, *args, **kwargs):
|
6 |
-
super().__init__(*args, **kwargs)
|
7 |
|
|
|
8 |
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
9 |
-
breakpoint()
|
10 |
if token_ids_1 is None:
|
11 |
return [id for id in token_ids_0 if id != self.eos_token_id]
|
12 |
return [id for id in token_ids_0 if id != self.eos_token_id] + [id for id in token_ids_1 if id != self.eos_token_id]
|
13 |
-
|
14 |
-
def get_vocab(self):
|
15 |
-
breakpoint()
|
16 |
-
return dict(self.vocab.items())
|
17 |
-
|
18 |
-
AutoTokenizer.register("ModernDecoderBERTTokenizer", ModernDecoderBERTTokenizer)
|
|
|
1 |
+
from transformers import PreTrainedTokenizerFast
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
+
class ModernDecoderBERTTokenizer(PreTrainedTokenizerFast):
|
4 |
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
|
|
5 |
if token_ids_1 is None:
|
6 |
return [id for id in token_ids_0 if id != self.eos_token_id]
|
7 |
return [id for id in token_ids_0 if id != self.eos_token_id] + [id for id in token_ids_1 if id != self.eos_token_id]
|
|
|
|
|
|
|
|
|
|
|
|