oweller2 committed on
Commit
bfe22ad
1 Parent(s): 8250eed

override input_ids

Browse files
Files changed (2) hide show
  1. tokenizer.py +4 -45
  2. tokenizer_config.json +1 -1
tokenizer.py CHANGED
@@ -4,48 +4,7 @@ class ModernDecoderBERTTokenizer(PreTrainedTokenizerFast):
4
  def __init__(self, *args, **kwargs):
5
  super().__init__(*args, **kwargs)
6
 
7
- def prepare_for_model(
8
- self,
9
- ids,
10
- pair_ids=None,
11
- add_special_tokens=True,
12
- padding=False,
13
- truncation=False,
14
- max_length=None,
15
- stride=0,
16
- pad_to_multiple_of=None,
17
- return_tensors=None,
18
- return_token_type_ids=None,
19
- return_attention_mask=None,
20
- return_overflowing_tokens=False,
21
- return_special_tokens_mask=False,
22
- return_offsets_mapping=False,
23
- return_length=False,
24
- verbose=True,
25
- prepend_batch_axis=False,
26
- **kwargs
27
- ):
28
- breakpoint()
29
- if add_special_tokens and self.eos_token_id in ids:
30
- ids = [id for id in ids if id != self.eos_token_id]
31
-
32
- return super().prepare_for_model(
33
- ids,
34
- pair_ids=pair_ids,
35
- add_special_tokens=add_special_tokens,
36
- padding=padding,
37
- truncation=truncation,
38
- max_length=max_length,
39
- stride=stride,
40
- pad_to_multiple_of=pad_to_multiple_of,
41
- return_tensors=return_tensors,
42
- return_token_type_ids=return_token_type_ids,
43
- return_attention_mask=return_attention_mask,
44
- return_overflowing_tokens=return_overflowing_tokens,
45
- return_special_tokens_mask=return_special_tokens_mask,
46
- return_offsets_mapping=return_offsets_mapping,
47
- return_length=return_length,
48
- verbose=verbose,
49
- prepend_batch_axis=prepend_batch_axis,
50
- **kwargs
51
- )
 
4
  def __init__(self, *args, **kwargs):
5
  super().__init__(*args, **kwargs)
6
 
7
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
8
+ if token_ids_1 is None:
9
+ return [id for id in token_ids_0 if id != self.eos_token_id]
10
+ return [id for id in token_ids_0 if id != self.eos_token_id] + [id for id in token_ids_1 if id != self.eos_token_id]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tokenizer_config.json CHANGED
@@ -937,7 +937,7 @@
937
  "sep_token": "[SEP]",
938
  "unk_token": "[UNK]",
939
  "eos_token": "[SEP]",
940
- "tokenizer_class": "PreTrainedTokenizerFast",
941
  "truncation": "right",
942
  "auto_map": {
943
  "AutoConfig": "orionweller/test-flex-gpt--configuration_bert.FlexBertConfig",
 
937
  "sep_token": "[SEP]",
938
  "unk_token": "[UNK]",
939
  "eos_token": "[SEP]",
940
+ "tokenizer_class": "ModernDecoderBERTTokenizer",
941
  "truncation": "right",
942
  "auto_map": {
943
  "AutoConfig": "orionweller/test-flex-gpt--configuration_bert.FlexBertConfig",