oweller2 committed
Commit
08d8d00
1 Parent(s): 6a03105
Files changed (2)
  1. tokenizer.py +53 -0
  2. tokenizer_config.json +9 -2
tokenizer.py ADDED
@@ -0,0 +1,53 @@
+ from transformers import PreTrainedTokenizerFast
+
+
+ class ModernDecoderBERTTokenizer(PreTrainedTokenizerFast):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def prepare_for_model(
+         self,
+         ids,
+         pair_ids=None,
+         add_special_tokens=True,
+         padding=False,
+         truncation=False,
+         max_length=None,
+         stride=0,
+         pad_to_multiple_of=None,
+         return_tensors=None,
+         return_token_type_ids=None,
+         return_attention_mask=None,
+         return_overflowing_tokens=False,
+         return_special_tokens_mask=False,
+         return_offsets_mapping=False,
+         return_length=False,
+         verbose=True,
+         prepend_batch_axis=False,
+         **kwargs
+     ):
+         # Drop any EOS ids already present in the input so the special tokens
+         # added by the base class are not duplicated.
+         if add_special_tokens and self.eos_token_id in ids:
+             ids = [id for id in ids if id != self.eos_token_id]
+
+         return super().prepare_for_model(
+             ids,
+             pair_ids=pair_ids,
+             add_special_tokens=add_special_tokens,
+             padding=padding,
+             truncation=truncation,
+             max_length=max_length,
+             stride=stride,
+             pad_to_multiple_of=pad_to_multiple_of,
+             return_tensors=return_tensors,
+             return_token_type_ids=return_token_type_ids,
+             return_attention_mask=return_attention_mask,
+             return_overflowing_tokens=return_overflowing_tokens,
+             return_special_tokens_mask=return_special_tokens_mask,
+             return_offsets_mapping=return_offsets_mapping,
+             return_length=return_length,
+             verbose=verbose,
+             prepend_batch_axis=prepend_batch_axis,
+             **kwargs
+         )
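A quick way to see what the override does is to call prepare_for_model directly. The sketch below is illustrative only and not part of the commit; it assumes the class and config ship in the orionweller/test-flex-gpt repo named in the auto_map below, and that executing remote code from that repo is acceptable.

from transformers import AutoTokenizer

# Hypothetical usage: load the custom tokenizer via its auto_map entry.
tok = AutoTokenizer.from_pretrained("orionweller/test-flex-gpt", trust_remote_code=True)

# Token ids that already end with the EOS id ([SEP] under this config).
ids = tok.convert_tokens_to_ids(tok.tokenize("hello world")) + [tok.eos_token_id]

# With add_special_tokens=True, the override strips the pre-existing EOS id
# before delegating to PreTrainedTokenizerFast.prepare_for_model, so the base
# class starts from EOS-free input when it applies its special-token logic.
out = tok.prepare_for_model(ids, add_special_tokens=True)
print(out["input_ids"])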
tokenizer_config.json CHANGED
@@ -935,7 +935,14 @@
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
- "tokenizer_class": "GPT2Tokenizer",
  "unk_token": "[UNK]",
- "eos_token": "[SEP]"
+ "eos_token": "[SEP]",
+ "tokenizer_class": "PreTrainedTokenizerFast",
+ "truncation": "right",
+ "auto_map": {
+   "AutoConfig": "orionweller/test-flex-gpt--configuration_bert.FlexBertConfig",
+   "AutoTokenizer": [
+     "orionweller/test-flex-gpt--tokenizer.ModernDecoderBERTTokenizer"
+   ]
+ }
 }
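Because tokenizer_class is now PreTrainedTokenizerFast and auto_map points at tokenizer.ModernDecoderBERTTokenizer, AutoTokenizer can resolve the custom class when remote code is trusted. A minimal sketch of the expected resolution, again assuming the files live in orionweller/test-flex-gpt:

from transformers import AutoTokenizer

# trust_remote_code=True is required because the class is defined in the repo's
# tokenizer.py rather than inside the transformers library itself.
tok = AutoTokenizer.from_pretrained("orionweller/test-flex-gpt", trust_remote_code=True)

print(type(tok).__name__)            # ModernDecoderBERTTokenizer
print(tok.eos_token, tok.sep_token)  # both "[SEP]" with this config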