pasha committed on
Commit b820243
1 Parent(s): 09becb5

Initial commit

Files changed (4)
  1. special_tokens_map.json +6 -0
  2. tokenizer.json +0 -0
  3. tokenizer.py +159 -0
  4. tokenizer_config.json +47 -0
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token": "<s>",
+   "eos_token": "</s>",
+   "pad_token": "<pad>",
+   "unk_token": "<unk>"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.py ADDED
@@ -0,0 +1,159 @@
+ import os
+ import json
+ import re
+ from typing import List
+
+ from tokenizers import pre_tokenizers, decoders, NormalizedString, PreTokenizedString
+ from transformers import PreTrainedTokenizerFast
+
+ from rumorpheme import RuMorphemeModel, labels_to_morphemes
+
+ DEFAULT_MODEL_NAME = "evilfreelancer/ruMorpheme-v0.1"
+ PAD, BEGIN, END, UNKNOWN, SPACE, SYSTEM, USER, ASSISTANT, FUNCTION_CALL, FUNCTION_RESPONSE = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+ AUXILIARY = ["<pad>", "<s>", "</s>", "<unk>", " ", "system", "user", "assistant", "function_call", "function_response"]
+ NUMBERS = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
+
+
+ class RuMorphemePreTokenizer:
+     """
+     Pre-tokenizer for the RuMorpheme model.
+     Splits on spaces and keeps the spaces as tokens,
+     then applies morpheme splitting to the non-space tokens.
+     """
+
+     def __init__(self, model_name: str = DEFAULT_MODEL_NAME):
+         self.model = RuMorphemeModel.from_pretrained(model_name)
+         self.model.eval()
+
+     def pre_tokenize(self, pretok: PreTokenizedString):
+         # First, split on spaces and include the spaces as tokens
+         pretok.split(self.split_on_spaces)
+         # Then, apply morpheme splitting to non-space tokens
+         pretok.split(self.morpheme_split)
+
+     def split_on_spaces(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
+         """
+         Splits on spaces and includes the spaces as tokens.
+         TODO: run performance tests on this function.
+         """
+         text = str(normalized_string)
+         splits = [NormalizedString(match.group()) for match in re.finditer(r'\s+|\S+', text)]
+         return splits
+
+     def morpheme_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
+         """
+         Splits a word into morphemes, including numbers and punctuation.
+         """
+         word = str(normalized_string)
+
+         # If the token is only whitespace, return it as is
+         if word.isspace():
+             return [normalized_string]
+
+         # Ignore special characters (non-alphabetic and non-numeric)
+         if not any(c.isalpha() or c.isdigit() for c in word):
+             return [normalized_string]
+
+         # Make predictions and return morphemes
+         all_predictions, all_log_probs = self.model.predict([word])
+         morphs, morph_types, _ = labels_to_morphemes(word.lower(), all_predictions[0], all_log_probs[0])
+         return [NormalizedString(f"{morph_type}/{morph}") for morph, morph_type in zip(morphs, morph_types)]
+
+
+ class RuMorphemeDecoder:
+     """
+     Custom decoder for the RuMorpheme model: removes the morph_type prefix from tokens and keeps spaces.
+     """
+
+     def decode_chain(self, tokens: List[str]) -> List[str]:
+         """
+         tokenizer.decode calls this function.
+         """
+         decoded_tokens = []
+         for token in tokens:
+             # If the token is a space, keep it as is
+             if token.isspace():
+                 decoded_tokens.append(token)
+             else:
+                 # Remove the morph_type prefix if present
+                 if '/' in token:
+                     _, morph = token.split('/', 1)
+                 else:
+                     morph = token
+                 decoded_tokens.append(morph)
+         return decoded_tokens
+
+
+ class RuMorphemeTokenizerFast(PreTrainedTokenizerFast):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+         # If the pre-tokenizer model is not specified, use the default
+         self.model_name = kwargs.get('model_name')
+         if self.model_name is None:
+             self.model_name = DEFAULT_MODEL_NAME
+
+         # Complete initialization
+         self.init_backend_tokenizer()
+
+     def init_backend_tokenizer(self):
+         # Custom pre-tokenizer
+         self.backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
+             pre_tokenizers.Punctuation(),
+             pre_tokenizers.PreTokenizer.custom(RuMorphemePreTokenizer(self.model_name))
+         ])
+         # Custom decoder
+         self.backend_tokenizer.decoder = decoders.Decoder.custom(RuMorphemeDecoder())
+
+     def save_pretrained(self, save_directory, **kwargs):
+         # Temporarily remove the custom pre-tokenizer and decoder before saving
+         original_pre_tokenizer = self.backend_tokenizer.pre_tokenizer
+         original_decoder = self.backend_tokenizer.decoder
+         self.backend_tokenizer.pre_tokenizer = None
+         self.backend_tokenizer.decoder = None
+
+         # Save the tokenizer using the parent method
+         super().save_pretrained(save_directory, **kwargs)
+
+         # Re-attach the custom pre-tokenizer and decoder
+         self.backend_tokenizer.pre_tokenizer = original_pre_tokenizer
+         self.backend_tokenizer.decoder = original_decoder
+
+         # Save the tokenizer class name in tokenizer_config.json
+         tokenizer_config_file = os.path.join(save_directory, 'tokenizer_config.json')
+         if os.path.isfile(tokenizer_config_file):
+             with open(tokenizer_config_file, 'r', encoding='utf-8') as f:
+                 tokenizer_config = json.load(f)
+         else:
+             tokenizer_config = {}
+
+         # Specify the tokenizer_class together with the module that defines it
+         tokenizer_config['tokenizer_class'] = "RuMorphemeTokenizerFast"
+         tokenizer_config['use_fast'] = True
+         tokenizer_config['auto_map'] = {"AutoTokenizer": ["", "tokenizer.RuMorphemeTokenizerFast"]}
+
+         with open(tokenizer_config_file, 'w', encoding='utf-8') as f:
+             json.dump(tokenizer_config, f, ensure_ascii=False)
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
+         # Load the tokenizer using the parent method
+         tokenizer = super(RuMorphemeTokenizerFast, cls).from_pretrained(
+             pretrained_model_name_or_path, *init_inputs, **kwargs
+         )
+
+         # If the pre-tokenizer model is not specified, use the default
+         model_name = kwargs.get('model_name')
+         if model_name is None:
+             model_name = DEFAULT_MODEL_NAME
+
+         # Custom pre-tokenizer
+         tokenizer.backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
+             pre_tokenizers.Punctuation(),
+             pre_tokenizers.PreTokenizer.custom(RuMorphemePreTokenizer(model_name))
+         ])
+
+         # Custom decoder
+         tokenizer.backend_tokenizer.decoder = decoders.Decoder.custom(RuMorphemeDecoder())
+
+         return tokenizer
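
Once these files are published to a model repo, loading goes through the auto_map entry, which requires trust_remote_code=True so that transformers imports RuMorphemeTokenizerFast from tokenizer.py. A minimal usage sketch, assuming the rumorpheme package is installed; the repo id below is a placeholder, not the actual repository name:

from transformers import AutoTokenizer

# Placeholder repo id; replace with the repository that actually hosts these files.
tokenizer = AutoTokenizer.from_pretrained("namespace/ruMorpheme-tokenizer", trust_remote_code=True)

text = "Привет мир"
ids = tokenizer.encode(text)
print(tokenizer.convert_ids_to_tokens(ids))  # tokens in "morph_type/morph" form, plus whitespace tokens
print(tokenizer.decode(ids))                 # RuMorphemeDecoder strips the morph_type prefixes

Note that save_pretrained detaches the custom Python pre-tokenizer and decoder before writing to disk, since the tokenizers library cannot serialize custom Python components; from_pretrained re-attaches them after loading.
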
tokenizer_config.json ADDED
@@ -0,0 +1,47 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "tokenizer_class": "RuMorphemeTokenizerFast",
+   "unk_token": "<unk>",
+   "use_fast": true,
+   "auto_map": {
+     "AutoTokenizer": ["", "tokenizer.RuMorphemeTokenizerFast"]
+   }
+ }