oweller2
committed on
Commit
•
8a083e2
1
Parent(s):
f64965c
update
Browse files
- tokenizer.py +4 -3
tokenizer.py
CHANGED
@@ -2,10 +2,11 @@ from transformers import PreTrainedTokenizerFast
|
|
2 |
|
3 |
class ModernDecoderBERTTokenizer(PreTrainedTokenizerFast):
|
4 |
|
5 |
-
def
|
6 |
breakpoint()
|
7 |
-
|
8 |
-
|
|
|
9 |
|
10 |
# Register the class
|
11 |
from transformers import AutoTokenizer
|
|
|
2 |
|
3 |
class ModernDecoderBERTTokenizer(PreTrainedTokenizerFast):
|
4 |
|
5 |
+
def _batch_encode_plus(self, *args, **kwargs):
|
6 |
breakpoint()
|
7 |
+
outputs = super()._batch_encode_plus(*args, **kwargs)
|
8 |
+
outputs['input_ids'] = [[id for id in ids if id != self.eos_token_id] for ids in outputs['input_ids']]
|
9 |
+
return outputs
|
10 |
|
11 |
# Register the class
|
12 |
from transformers import AutoTokenizer
|