oweller2
committed on
Commit
•
08d8d00
1
Parent(s):
6a03105
tokenizer
Browse files- tokenizer.py +49 -0
- tokenizer_config.json +8 -2
tokenizer.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import PreTrainedTokenizerFast


class ModernDecoderBERTTokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer that drops pre-existing EOS ids before adding special tokens.

    The only behavioral difference from ``PreTrainedTokenizerFast`` visible here
    is in :meth:`prepare_for_model`: when special tokens are about to be added,
    any occurrence of ``self.eos_token_id`` already present in ``ids`` is
    removed first.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to ``PreTrainedTokenizerFast``."""
        super().__init__(*args, **kwargs)

    def prepare_for_model(
        self,
        ids,
        pair_ids=None,
        add_special_tokens=True,
        padding=False,
        truncation=False,
        max_length=None,
        stride=0,
        pad_to_multiple_of=None,
        return_tensors=None,
        return_token_type_ids=None,
        return_attention_mask=None,
        return_overflowing_tokens=False,
        return_special_tokens_mask=False,
        return_offsets_mapping=False,
        return_length=False,
        verbose=True,
        prepend_batch_axis=False,
        **kwargs
    ):
        """Strip existing EOS ids from ``ids``, then defer to the parent class.

        Args:
            ids: Token ids of the first sequence. Occurrences of
                ``self.eos_token_id`` are filtered out when
                ``add_special_tokens`` is truthy.
            pair_ids: Optional ids of the second sequence; passed through
                untouched (EOS ids in it are NOT filtered).
            add_special_tokens: When truthy, enables the EOS filtering and is
                forwarded to the parent implementation.
            All remaining parameters are forwarded verbatim to
            ``super().prepare_for_model``.

        Returns:
            Whatever the parent ``prepare_for_model`` returns.
        """
        # NOTE(review): presumably the parent appends its own EOS/SEP when
        # add_special_tokens is set, so this avoids duplicates - confirm.
        if add_special_tokens and self.eos_token_id in ids:
            eos_id = self.eos_token_id
            ids = [token_id for token_id in ids if token_id != eos_id]

        return super().prepare_for_model(
            ids,
            pair_ids=pair_ids,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            prepend_batch_axis=prepend_batch_axis,
            **kwargs
        )
|
tokenizer_config.json
CHANGED
@@ -935,7 +935,13 @@
|
|
935 |
"model_max_length": 1000000000000000019884624838656,
|
936 |
"pad_token": "[PAD]",
|
937 |
"sep_token": "[SEP]",
|
938 |
-
"tokenizer_class": "GPT2Tokenizer",
|
939 |
"unk_token": "[UNK]",
|
940 |
-
"eos_token": "[SEP]"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
941 |
}
|
|
|
935 |
"model_max_length": 1000000000000000019884624838656,
|
936 |
"pad_token": "[PAD]",
|
937 |
"sep_token": "[SEP]",
|
|
|
938 |
"unk_token": "[UNK]",
|
939 |
+
"eos_token": "[SEP]",
|
940 |
+
"tokenizer_class": "PreTrainedTokenizerFast",
|
941 |
+
"truncation": "right",
|
942 |
+
"auto_map": {
|
943 |
+
"AutoConfig": "orionweller/test-flex-gpt--configuration_bert.FlexBertConfig",
|
944 |
+
"AutoTokenizer": [
|
945 |
+
"orionweller/test-flex-gpt--tokenizer.ModernDecoderBERTTokenizer"
|
946 |
+
]
|
947 |
}
|