finnstrom3693
committed on
Commit • b71846a
Parent(s): d3801be
Create tokenizer_make.py
tokenizer_make.py +53 -0
tokenizer_make.py
ADDED
@@ -0,0 +1,53 @@
from transformers import BertTokenizerFast
import os

class MiniSunTokenizer:
    def __init__(self, vocab_file=None):
        # You can use BERT's tokenizer or any custom vocabulary tokenizer
        if vocab_file:
            self.tokenizer = BertTokenizerFast(vocab_file=vocab_file, do_lower_case=False)
        else:
            # Default BERT tokenizer without a specific vocab file
            self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

        # Define special tokens if needed (customizable)
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.cls_token = '[CLS]'
        self.sep_token = '[SEP]'
        self.mask_token = '[MASK]'

    def tokenize(self, text):
        # Tokenizes the input text
        return self.tokenizer.tokenize(text)

    def encode(self, text, max_length=512, padding=True, truncation=True):
        # Converts the text into input IDs and an attention mask
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length' if padding else False,
            truncation=truncation,
            return_attention_mask=True,
            return_tensors='tf'
        )
        return encoded['input_ids'], encoded['attention_mask']

    def decode(self, token_ids):
        # Decodes token IDs back into text
        return self.tokenizer.decode(token_ids, skip_special_tokens=True)

    def save_pretrained(self, save_directory):
        # Save the tokenizer in Hugging Face format
        os.makedirs(save_directory, exist_ok=True)
        self.tokenizer.save_pretrained(save_directory)

# Example usage of the tokenizer
tokenizer = MiniSunTokenizer()

text = "Hello, this is a test sentence for the MiniSun model."
input_ids, attention_mask = tokenizer.encode(text, max_length=20)

print("Input IDs:", input_ids)
print("Attention Mask:", attention_mask)
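
Since MiniSunTokenizer delegates everything to BertTokenizerFast, the IDs returned by encode() can be round-tripped through decode(), and a directory written by save_pretrained() can be reloaded with the stock Hugging Face from_pretrained API, without the wrapper class. A minimal sketch continuing the example above; the directory name 'minisun_tokenizer' is an arbitrary illustration, not part of the committed file:

# Round-trip check: encode() returned a (1, max_length) TF tensor,
# so decode the first row back into text.
print("Decoded:", tokenizer.decode(input_ids[0]))

# The directory written by save_pretrained() is a standard Hugging Face
# tokenizer directory, so the stock loader can read it back directly.
tokenizer.save_pretrained('minisun_tokenizer')  # hypothetical path
reloaded = BertTokenizerFast.from_pretrained('minisun_tokenizer')
print(reloaded.tokenize(text))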