New version

- char_tokenizer.py +17 -6
- config.json +11 -11
- pytorch_model.bin +2 -2
- special_tokens_map.json +4 -4
- tokenizer_config.json +6 -8
- vocab.txt +4 -34
char_tokenizer.py
CHANGED
@@ -3,7 +3,7 @@ from typing import Optional, Tuple, List
 from collections import OrderedDict
 
 from torch.utils.data import Dataset
-from transformers import PreTrainedTokenizer
+from transformers import PreTrainedTokenizer, AutoTokenizer
 
 
 def load_vocab(vocab_file):
@@ -22,10 +22,11 @@ class CharTokenizer(PreTrainedTokenizer):
     def __init__(
         self,
         vocab_file=None,
-        pad_token="[
-        unk_token="[
-        bos_token="[
-        eos_token="[
+        pad_token="[pad]",
+        unk_token="[unk]",
+        bos_token="[bos]",
+        eos_token="[eos]",
+        do_lower_case=False,
         *args,
         **kwargs
     ):
@@ -34,8 +35,10 @@ class CharTokenizer(PreTrainedTokenizer):
             unk_token=unk_token,
             bos_token=bos_token,
             eos_token=eos_token,
+            do_lower_case=do_lower_case,
             **kwargs
         )
+        self.do_lower_case = do_lower_case
 
         if not vocab_file or not os.path.isfile(vocab_file):
             self.vocab = OrderedDict()
@@ -49,6 +52,8 @@ class CharTokenizer(PreTrainedTokenizer):
         with open(file_path) as r:
             for line in r:
                 word = line.strip()
+                if self.do_lower_case:
+                    word = word.lower()
                 vocab |= set(word)
         vocab = list(vocab)
         vocab.sort()
@@ -67,12 +72,16 @@ class CharTokenizer(PreTrainedTokenizer):
         return self.vocab
 
     def _convert_token_to_id(self, token):
-
+        if self.do_lower_case:
+            token = token.lower()
+        return self.vocab.get(token, self.vocab[self.unk_token])
 
     def _convert_id_to_token(self, index):
         return self.ids_to_tokens[index]
 
     def _tokenize(self, text):
+        if self.do_lower_case:
+            text = text.lower()
         return list(text)
 
     def convert_tokens_to_string(self, tokens):
@@ -119,3 +128,5 @@ class CharTokenizer(PreTrainedTokenizer):
             writer.write(token + "\n")
             index += 1
         return (vocab_file,)
+
+AutoTokenizer.register("char_tokenizer", CharTokenizer)
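Taken together, the changes above add case folding (do_lower_case) to every lookup path and register the class with AutoTokenizer as a side effect of importing char_tokenizer.py. A minimal usage sketch, not part of the commit: it assumes the repository is checked out locally, transformers ~= 4.25 (the version recorded in config.json) is installed, and the parts of CharTokenizer not shown in this diff (e.g. vocab_files_names pointing at vocab.txt) are defined as usual.

from transformers import AutoTokenizer

import char_tokenizer  # noqa: F401 -- importing runs AutoTokenizer.register("char_tokenizer", CharTokenizer)

# Load from the repo folder (contains tokenizer_config.json and vocab.txt).
tokenizer = AutoTokenizer.from_pretrained(".")

# With "do_lower_case": true, upper- and lower-case characters should map to the
# same ids, and characters missing from vocab.txt fall back to the [unk] id.
print(tokenizer.tokenize("Привет"))                 # expected: ['п', 'р', 'и', 'в', 'е', 'т']
print(tokenizer.convert_tokens_to_ids(["П", "п"]))  # expected: the same id twice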
config.json
CHANGED
@@ -2,17 +2,17 @@
   "architectures": [
     "DebertaV2ForTokenClassification"
   ],
-  "attention_probs_dropout_prob": 0.
+  "attention_probs_dropout_prob": 0.2,
   "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.
-  "hidden_size":
+  "hidden_dropout_prob": 0.2,
+  "hidden_size": 256,
   "id2label": {
     "0": "NO",
     "1": "PRIMARY",
     "2": "SECONDARY"
   },
   "initializer_range": 0.02,
-  "intermediate_size":
+  "intermediate_size": 1024,
   "label2id": {
     "NO": 0,
     "PRIMARY": 1,
@@ -20,20 +20,20 @@
   },
   "layer_norm_eps": 1e-07,
   "max_length": 40,
-  "max_position_embeddings":
-  "max_relative_positions":
+  "max_position_embeddings": 42,
+  "max_relative_positions": 42,
   "model_type": "deberta-v2",
-  "num_attention_heads":
+  "num_attention_heads": 8,
   "num_hidden_layers": 4,
   "pad_token_id": 0,
   "pooler_dropout": 0,
   "pooler_hidden_act": "gelu",
-  "pooler_hidden_size":
+  "pooler_hidden_size": 256,
   "pos_att_type": null,
   "position_biased_input": true,
-  "relative_attention":
+  "relative_attention": true,
   "torch_dtype": "float32",
-  "transformers_version": "4.
+  "transformers_version": "4.25.1",
   "type_vocab_size": 0,
-  "vocab_size":
+  "vocab_size": 40
 }
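The filled-in values describe a small character-level DeBERTa-v2 token classifier: 4 layers, hidden size 256, 8 attention heads, intermediate size 1024, relative attention over 42 positions, and a 40-entry character vocabulary with three labels (NO / PRIMARY / SECONDARY). An illustrative sketch of the same architecture built from scratch (randomly initialised; the trained weights live in pytorch_model.bin):

from transformers import DebertaV2Config, DebertaV2ForTokenClassification

config = DebertaV2Config(
    vocab_size=40,                 # one id per line of vocab.txt
    hidden_size=256,
    num_hidden_layers=4,
    num_attention_heads=8,
    intermediate_size=1024,
    max_position_embeddings=42,    # presumably max_length 40 plus [bos]/[eos]
    relative_attention=True,
    max_relative_positions=42,
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
    type_vocab_size=0,
    pad_token_id=0,
    num_labels=3,
    id2label={0: "NO", 1: "PRIMARY", 2: "SECONDARY"},
    label2id={"NO": 0, "PRIMARY": 1, "SECONDARY": 2},
)
model = DebertaV2ForTokenClassification(config)    # random init, same shapes as the checkpoint
print(sum(p.numel() for p in model.parameters()))  # ~3.2M parameters, consistent with the ~12.8 MB float32 checkpoint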
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:b93628caf4493a15351b7b17bfb6c4d77a26960f08ee247f8959b6eb70e7db24
+size 12835213
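The updated git-lfs pointer records the checksum and size of the new weights, so a local copy can be verified against it. A small sketch, assuming pytorch_model.bin has already been downloaded (e.g. via git lfs pull or huggingface_hub):

import hashlib
import os

EXPECTED_OID = "b93628caf4493a15351b7b17bfb6c4d77a26960f08ee247f8959b6eb70e7db24"
EXPECTED_SIZE = 12835213  # bytes, from the pointer file above

path = "pytorch_model.bin"
digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        digest.update(chunk)

assert os.path.getsize(path) == EXPECTED_SIZE
assert digest.hexdigest() == EXPECTED_OID
print("pytorch_model.bin matches the LFS pointer")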
special_tokens_map.json
CHANGED
@@ -1,6 +1,6 @@
 {
-  "bos_token": "[
-  "eos_token": "[
-  "pad_token": "[
-  "unk_token": "[
+  "bos_token": "[bos]",
+  "eos_token": "[eos]",
+  "pad_token": "[pad]",
+  "unk_token": "[unk]"
 }
tokenizer_config.json
CHANGED
@@ -1,11 +1,9 @@
 {
-  "bos_token": "[
-  "
-  "
-  "
-  "
+  "bos_token": "[bos]",
+  "do_lower_case": true,
+  "eos_token": "[eos]",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[pad]",
   "tokenizer_class": "CharTokenizer",
-  "
-    "AutoTokenizer": ["char_tokenizer.CharTokenizer", null]
-  }
+  "unk_token": "[unk]"
 }
vocab.txt
CHANGED
@@ -1,40 +1,10 @@
-[
-[
-[
-[
+[pad]
+[unk]
+[bos]
+[eos]
 '
 -
 `
-А
-Б
-В
-Г
-Д
-Е
-Ж
-З
-И
-Й
-К
-Л
-М
-Н
-О
-П
-Р
-С
-Т
-У
-Ф
-Х
-Ц
-Ч
-Ш
-Щ
-Ы
-Э
-Ю
-Я
 а
 б
 в
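After the trim, vocab.txt starts with the four lowercase special tokens and keeps only the lowercase character inventory; the uppercase Cyrillic block is gone because do_lower_case now folds input onto the lowercase entries. A hypothetical check, mirroring the one-token-per-line layout of the file:

from collections import OrderedDict

vocab = OrderedDict()
with open("vocab.txt", encoding="utf-8") as f:
    for idx, line in enumerate(f):
        vocab[line.rstrip("\n")] = idx  # id = line index

print(vocab["[pad]"])  # 0 -- matches "pad_token_id": 0 in config.json
print(vocab["[unk]"])  # 1
print(len(vocab))      # 40 -- matches "vocab_size": 40 in config.json
print("А" in vocab)    # False: the uppercase letters were removed in this commit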