IlyaGusev committed on
Commit
62b5660
·
1 Parent(s): ce03021

New version

Browse files
char_tokenizer.py CHANGED
@@ -3,7 +3,7 @@ from typing import Optional, Tuple, List
3
  from collections import OrderedDict
4
 
5
  from torch.utils.data import Dataset
6
- from transformers import PreTrainedTokenizer
7
 
8
 
9
  def load_vocab(vocab_file):
@@ -22,10 +22,11 @@ class CharTokenizer(PreTrainedTokenizer):
22
  def __init__(
23
  self,
24
  vocab_file=None,
25
- pad_token="[PAD]",
26
- unk_token="[UNK]",
27
- bos_token="[BOS]",
28
- eos_token="[EOS]",
 
29
  *args,
30
  **kwargs
31
  ):
@@ -34,8 +35,10 @@ class CharTokenizer(PreTrainedTokenizer):
34
  unk_token=unk_token,
35
  bos_token=bos_token,
36
  eos_token=eos_token,
 
37
  **kwargs
38
  )
 
39
 
40
  if not vocab_file or not os.path.isfile(vocab_file):
41
  self.vocab = OrderedDict()
@@ -49,6 +52,8 @@ class CharTokenizer(PreTrainedTokenizer):
49
  with open(file_path) as r:
50
  for line in r:
51
  word = line.strip()
 
 
52
  vocab |= set(word)
53
  vocab = list(vocab)
54
  vocab.sort()
@@ -67,12 +72,16 @@ class CharTokenizer(PreTrainedTokenizer):
67
  return self.vocab
68
 
69
  def _convert_token_to_id(self, token):
70
- return self.vocab.get(token)
 
 
71
 
72
  def _convert_id_to_token(self, index):
73
  return self.ids_to_tokens[index]
74
 
75
  def _tokenize(self, text):
 
 
76
  return list(text)
77
 
78
  def convert_tokens_to_string(self, tokens):
@@ -119,3 +128,5 @@ class CharTokenizer(PreTrainedTokenizer):
119
  writer.write(token + "\n")
120
  index += 1
121
  return (vocab_file,)
 
 
 
3
  from collections import OrderedDict
4
 
5
  from torch.utils.data import Dataset
6
+ from transformers import PreTrainedTokenizer, AutoTokenizer
7
 
8
 
9
  def load_vocab(vocab_file):
 
22
  def __init__(
23
  self,
24
  vocab_file=None,
25
+ pad_token="[pad]",
26
+ unk_token="[unk]",
27
+ bos_token="[bos]",
28
+ eos_token="[eos]",
29
+ do_lower_case=False,
30
  *args,
31
  **kwargs
32
  ):
 
35
  unk_token=unk_token,
36
  bos_token=bos_token,
37
  eos_token=eos_token,
38
+ do_lower_case=do_lower_case,
39
  **kwargs
40
  )
41
+ self.do_lower_case = do_lower_case
42
 
43
  if not vocab_file or not os.path.isfile(vocab_file):
44
  self.vocab = OrderedDict()
 
52
  with open(file_path) as r:
53
  for line in r:
54
  word = line.strip()
55
+ if self.do_lower_case:
56
+ word = word.lower()
57
  vocab |= set(word)
58
  vocab = list(vocab)
59
  vocab.sort()
 
72
  return self.vocab
73
 
74
  def _convert_token_to_id(self, token):
75
+ if self.do_lower_case:
76
+ token = token.lower()
77
+ return self.vocab.get(token, self.vocab[self.unk_token])
78
 
79
  def _convert_id_to_token(self, index):
80
  return self.ids_to_tokens[index]
81
 
82
  def _tokenize(self, text):
83
+ if self.do_lower_case:
84
+ text = text.lower()
85
  return list(text)
86
 
87
  def convert_tokens_to_string(self, tokens):
 
128
  writer.write(token + "\n")
129
  index += 1
130
  return (vocab_file,)
131
+
132
+ AutoTokenizer.register("char_tokenizer", CharTokenizer)
config.json CHANGED
@@ -2,17 +2,17 @@
2
  "architectures": [
3
  "DebertaV2ForTokenClassification"
4
  ],
5
- "attention_probs_dropout_prob": 0.1,
6
  "hidden_act": "gelu",
7
- "hidden_dropout_prob": 0.1,
8
- "hidden_size": 128,
9
  "id2label": {
10
  "0": "NO",
11
  "1": "PRIMARY",
12
  "2": "SECONDARY"
13
  },
14
  "initializer_range": 0.02,
15
- "intermediate_size": 512,
16
  "label2id": {
17
  "NO": 0,
18
  "PRIMARY": 1,
@@ -20,20 +20,20 @@
20
  },
21
  "layer_norm_eps": 1e-07,
22
  "max_length": 40,
23
- "max_position_embeddings": 64,
24
- "max_relative_positions": -1,
25
  "model_type": "deberta-v2",
26
- "num_attention_heads": 4,
27
  "num_hidden_layers": 4,
28
  "pad_token_id": 0,
29
  "pooler_dropout": 0,
30
  "pooler_hidden_act": "gelu",
31
- "pooler_hidden_size": 128,
32
  "pos_att_type": null,
33
  "position_biased_input": true,
34
- "relative_attention": false,
35
  "torch_dtype": "float32",
36
- "transformers_version": "4.20.1",
37
  "type_vocab_size": 0,
38
- "vocab_size": 70
39
  }
 
2
  "architectures": [
3
  "DebertaV2ForTokenClassification"
4
  ],
5
+ "attention_probs_dropout_prob": 0.2,
6
  "hidden_act": "gelu",
7
+ "hidden_dropout_prob": 0.2,
8
+ "hidden_size": 256,
9
  "id2label": {
10
  "0": "NO",
11
  "1": "PRIMARY",
12
  "2": "SECONDARY"
13
  },
14
  "initializer_range": 0.02,
15
+ "intermediate_size": 1024,
16
  "label2id": {
17
  "NO": 0,
18
  "PRIMARY": 1,
 
20
  },
21
  "layer_norm_eps": 1e-07,
22
  "max_length": 40,
23
+ "max_position_embeddings": 42,
24
+ "max_relative_positions": 42,
25
  "model_type": "deberta-v2",
26
+ "num_attention_heads": 8,
27
  "num_hidden_layers": 4,
28
  "pad_token_id": 0,
29
  "pooler_dropout": 0,
30
  "pooler_hidden_act": "gelu",
31
+ "pooler_hidden_size": 256,
32
  "pos_att_type": null,
33
  "position_biased_input": true,
34
+ "relative_attention": true,
35
  "torch_dtype": "float32",
36
+ "transformers_version": "4.25.1",
37
  "type_vocab_size": 0,
38
+ "vocab_size": 40
39
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f168b92d2481d3e465ef0c3579cd771c3dbde8188b90efe582cc4468085a9807
3
- size 3267088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b93628caf4493a15351b7b17bfb6c4d77a26960f08ee247f8959b6eb70e7db24
3
+ size 12835213
special_tokens_map.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "bos_token": "[BOS]",
3
- "eos_token": "[EOS]",
4
- "pad_token": "[PAD]",
5
- "unk_token": "[UNK]"
6
  }
 
1
  {
2
+ "bos_token": "[bos]",
3
+ "eos_token": "[eos]",
4
+ "pad_token": "[pad]",
5
+ "unk_token": "[unk]"
6
  }
tokenizer_config.json CHANGED
@@ -1,11 +1,9 @@
1
  {
2
- "bos_token": "[BOS]",
3
- "eos_token": "[EOS]",
4
- "pad_token": "[PAD]",
5
- "unk_token": "[UNK]",
6
- "model_max_length": 40,
7
  "tokenizer_class": "CharTokenizer",
8
- "auto_map": {
9
- "AutoTokenizer": ["char_tokenizer.CharTokenizer", null]
10
- }
11
  }
 
1
  {
2
+ "bos_token": "[bos]",
3
+ "do_lower_case": true,
4
+ "eos_token": "[eos]",
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "pad_token": "[pad]",
7
  "tokenizer_class": "CharTokenizer",
8
+ "unk_token": "[unk]"
 
 
9
  }
vocab.txt CHANGED
@@ -1,40 +1,10 @@
1
- [PAD]
2
- [UNK]
3
- [BOS]
4
- [EOS]
5
  '
6
  -
7
  `
8
- А
9
- Б
10
- В
11
- Г
12
- Д
13
- Е
14
- Ж
15
- З
16
- И
17
- Й
18
- К
19
- Л
20
- М
21
- Н
22
- О
23
- П
24
- Р
25
- С
26
- Т
27
- У
28
- Ф
29
- Х
30
- Ц
31
- Ч
32
- Ш
33
- Щ
34
- Ы
35
- Э
36
- Ю
37
- Я
38
  а
39
  б
40
  в
 
1
+ [pad]
2
+ [unk]
3
+ [bos]
4
+ [eos]
5
  '
6
  -
7
  `
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  а
9
  б
10
  в