Small files

- config.json +39 -0
- special_tokens_map.json +6 -0
- tokenizer.py +121 -0
- tokenizer_config.json +7 -0
- vocab.txt +71 -0
config.json
ADDED
@@ -0,0 +1,39 @@
{
  "architectures": [
    "DebertaV2ForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "id2label": {
    "0": "NO",
    "1": "PRIMARY",
    "2": "SECONDARY"
  },
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "label2id": {
    "NO": 0,
    "PRIMARY": 1,
    "SECONDARY": 2
  },
  "layer_norm_eps": 1e-07,
  "max_length": 40,
  "max_position_embeddings": 64,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 128,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 0,
  "vocab_size": 71
}
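For reference, this config describes a small 4-layer DeBERTa-v2 token classifier with three labels (NO, PRIMARY, SECONDARY). Below is a minimal sketch of materializing that architecture from the file; it assumes config.json is in the working directory, and since the trained weights are not part of this small-files commit, the model it builds is randomly initialized.

# Sketch: build the (untrained) architecture described by config.json.
# Assumes config.json from this commit is in the current directory; the
# trained checkpoint is not included here, so parameters are random.
from transformers import DebertaV2Config, DebertaV2ForTokenClassification

config = DebertaV2Config.from_json_file("config.json")
model = DebertaV2ForTokenClassification(config)

print(config.id2label)                                # {0: 'NO', 1: 'PRIMARY', 2: 'SECONDARY'}
print(config.num_hidden_layers, config.hidden_size)   # 4 128
print(sum(p.numel() for p in model.parameters()))     # rough parameter count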
special_tokens_map.json
ADDED
@@ -0,0 +1,6 @@
{
  "bos_token": "[BOS]",
  "eos_token": "[EOS]",
  "pad_token": "[PAD]",
  "unk_token": "[UNK]"
}
tokenizer.py
ADDED
@@ -0,0 +1,121 @@
import os
from typing import Optional, Tuple, List
from collections import OrderedDict

from transformers import PreTrainedTokenizer


def load_vocab(vocab_file):
    """Read a vocabulary file (one token per line) into an ordered token -> id mapping."""
    vocab = OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab


class CharTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer: every character of the input is a separate token."""

    vocab_files_names = {"vocab_file": "vocab.txt"}

    def __init__(
        self,
        vocab_file=None,
        pad_token="[PAD]",
        unk_token="[UNK]",
        bos_token="[BOS]",
        eos_token="[EOS]",
        *args,
        **kwargs
    ):
        super().__init__(
            pad_token=pad_token,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs
        )

        if not vocab_file or not os.path.isfile(vocab_file):
            self.vocab = OrderedDict()
            self.ids_to_tokens = OrderedDict()
        else:
            self.vocab = load_vocab(vocab_file)
            self.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])

    def train(self, file_path):
        # Build the vocabulary from the set of characters seen in the training file,
        # with the special tokens occupying the first four ids.
        vocab = set()
        with open(file_path, encoding="utf-8") as r:
            for line in r:
                word = line.strip()
                vocab |= set(word)
        vocab = special_tokens = [self.pad_token, self.unk_token, self.bos_token, self.eos_token] + sorted(vocab)

        for i, ch in enumerate(vocab):
            self.vocab[ch] = i
        self.ids_to_tokens = OrderedDict(enumerate(vocab))

    @property
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return self.vocab

    def _convert_token_to_id(self, token):
        # Fall back to the [UNK] id for characters missing from the vocabulary.
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        return self.ids_to_tokens[index]

    def _tokenize(self, text):
        return list(text)

    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)

    def build_inputs_with_special_tokens(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        # Single-sequence inputs only: [BOS] + tokens + [EOS].
        bos = [self.bos_token_id]
        eos = [self.eos_token_id]
        return bos + token_ids_0 + eos

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        return (len(token_ids_0) + 2) * [0]

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        assert os.path.isdir(save_directory)
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") +
            self.vocab_files_names["vocab_file"]
        )
        index = 0
        with open(vocab_file, "w", encoding="utf-8") as writer:
            # Write tokens in id order, one per line, so load_vocab can rebuild the mapping.
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                assert index == token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)
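A quick usage sketch for the tokenizer above, assuming tokenizer.py and vocab.txt from this commit sit in the working directory and a transformers version contemporary to the commit (config.json records 4.20.1):

# Sketch: character-level tokenization with the CharTokenizer defined above.
# Assumes vocab.txt from this commit is in the current directory.
from tokenizer import CharTokenizer

tok = CharTokenizer("vocab.txt")
print(tok.tokenize("привет"))      # ['п', 'р', 'и', 'в', 'е', 'т']
enc = tok("привет")                # adds [BOS]/[EOS] around the character ids
print(enc["input_ids"])
print(tok.decode(enc["input_ids"], skip_special_tokens=True))  # "привет"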
tokenizer_config.json
ADDED
@@ -0,0 +1,7 @@
{
  "bos_token": "[BOS]",
  "eos_token": "[EOS]",
  "pad_token": "[PAD]",
  "tokenizer_class": "CharTokenizer",
  "unk_token": "[UNK]"
}
vocab.txt
ADDED
@@ -0,0 +1,71 @@
[PAD]
[UNK]
[BOS]
[EOS]
'
-
`
Ё
А
Б
В
Г
Д
Е
Ж
З
И
Й
К
Л
М
Н
О
П
Р
С
Т
У
Ф
Х
Ц
Ч
Ш
Щ
Ы
Э
Ю
Я
а
б
в
г
д
е
ж
з
и
й
к
л
м
н
о
п
р
с
т
у
ф
х
ц
ч
ш
щ
ъ
ы
ь
э
ю
я
ё
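Putting the pieces together, here is a hedged end-to-end sketch of per-character token classification with these files. The checkpoint directory name is a placeholder, and the sketch assumes the trained weights (not part of this small-files commit) have been downloaded alongside config.json, vocab.txt, and tokenizer.py.

# Sketch: per-character classification with the files above.
# "path/to/checkpoint" is a placeholder for a local directory holding
# config.json, vocab.txt, tokenizer.py, and the trained weights
# (the weights are not included in this commit).
import torch
from transformers import AutoModelForTokenClassification
from tokenizer import CharTokenizer

path = "path/to/checkpoint"  # placeholder
tok = CharTokenizer(f"{path}/vocab.txt")
model = AutoModelForTokenClassification.from_pretrained(path)
model.eval()

word = "молоко"
inputs = tok(word, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits[0]      # shape: (len(word) + 2, 3)

# Drop the [BOS]/[EOS] positions and map each character to its label.
pred = logits.argmax(-1)[1:-1].tolist()
print([(ch, model.config.id2label[i]) for ch, i in zip(word, pred)])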