oweller2
committed on
Commit
•
bfe22ad
1
Parent(s):
8250eed
override input_ids
Browse files
- tokenizer.py +4 -45
- tokenizer_config.json +1 -1
tokenizer.py
CHANGED
@@ -4,48 +4,7 @@ class ModernDecoderBERTTokenizer(PreTrainedTokenizerFast):
|
|
4 |
def __init__(self, *args, **kwargs):
|
5 |
super().__init__(*args, **kwargs)
|
6 |
|
7 |
-
def
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
add_special_tokens=True,
|
12 |
-
padding=False,
|
13 |
-
truncation=False,
|
14 |
-
max_length=None,
|
15 |
-
stride=0,
|
16 |
-
pad_to_multiple_of=None,
|
17 |
-
return_tensors=None,
|
18 |
-
return_token_type_ids=None,
|
19 |
-
return_attention_mask=None,
|
20 |
-
return_overflowing_tokens=False,
|
21 |
-
return_special_tokens_mask=False,
|
22 |
-
return_offsets_mapping=False,
|
23 |
-
return_length=False,
|
24 |
-
verbose=True,
|
25 |
-
prepend_batch_axis=False,
|
26 |
-
**kwargs
|
27 |
-
):
|
28 |
-
breakpoint()
|
29 |
-
if add_special_tokens and self.eos_token_id in ids:
|
30 |
-
ids = [id for id in ids if id != self.eos_token_id]
|
31 |
-
|
32 |
-
return super().prepare_for_model(
|
33 |
-
ids,
|
34 |
-
pair_ids=pair_ids,
|
35 |
-
add_special_tokens=add_special_tokens,
|
36 |
-
padding=padding,
|
37 |
-
truncation=truncation,
|
38 |
-
max_length=max_length,
|
39 |
-
stride=stride,
|
40 |
-
pad_to_multiple_of=pad_to_multiple_of,
|
41 |
-
return_tensors=return_tensors,
|
42 |
-
return_token_type_ids=return_token_type_ids,
|
43 |
-
return_attention_mask=return_attention_mask,
|
44 |
-
return_overflowing_tokens=return_overflowing_tokens,
|
45 |
-
return_special_tokens_mask=return_special_tokens_mask,
|
46 |
-
return_offsets_mapping=return_offsets_mapping,
|
47 |
-
return_length=return_length,
|
48 |
-
verbose=verbose,
|
49 |
-
prepend_batch_axis=prepend_batch_axis,
|
50 |
-
**kwargs
|
51 |
-
)
|
|
|
4 |
def __init__(self, *args, **kwargs):
|
5 |
super().__init__(*args, **kwargs)
|
6 |
|
7 |
+
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
8 |
+
if token_ids_1 is None:
|
9 |
+
return [id for id in token_ids_0 if id != self.eos_token_id]
|
10 |
+
return [id for id in token_ids_0 if id != self.eos_token_id] + [id for id in token_ids_1 if id != self.eos_token_id]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tokenizer_config.json
CHANGED
@@ -937,7 +937,7 @@
|
|
937 |
"sep_token": "[SEP]",
|
938 |
"unk_token": "[UNK]",
|
939 |
"eos_token": "[SEP]",
|
940 |
-
"tokenizer_class": "
|
941 |
"truncation": "right",
|
942 |
"auto_map": {
|
943 |
"AutoConfig": "orionweller/test-flex-gpt--configuration_bert.FlexBertConfig",
|
|
|
937 |
"sep_token": "[SEP]",
|
938 |
"unk_token": "[UNK]",
|
939 |
"eos_token": "[SEP]",
|
940 |
+
"tokenizer_class": "ModernDecoderBERTTokenizer",
|
941 |
"truncation": "right",
|
942 |
"auto_map": {
|
943 |
"AutoConfig": "orionweller/test-flex-gpt--configuration_bert.FlexBertConfig",
|