oweller2 committed on
Commit
7b38d2c
·
1 Parent(s): 8a083e2
Files changed (1) hide show
  1. tokenizer.py +3 -2
tokenizer.py CHANGED
@@ -3,9 +3,10 @@ from transformers import PreTrainedTokenizerFast
3
  class ModernDecoderBERTTokenizer(PreTrainedTokenizerFast):
4
 
5
  def _batch_encode_plus(self, *args, **kwargs):
6
- breakpoint()
7
  outputs = super()._batch_encode_plus(*args, **kwargs)
8
- outputs['input_ids'] = [[id for id in ids if id != self.eos_token_id] for ids in outputs['input_ids']]
 
 
9
  return outputs
10
 
11
  # Register the class
 
3
  class ModernDecoderBERTTokenizer(PreTrainedTokenizerFast):
4
 
5
  def _batch_encode_plus(self, *args, **kwargs):
 
6
  outputs = super()._batch_encode_plus(*args, **kwargs)
7
+ del outputs["token_type_ids"]
8
+ for key in ['input_ids', 'attention_mask']:
9
+ outputs[key] = [sequence[:-1] for sequence in outputs[key]]
10
  return outputs
11
 
12
  # Register the class