finnstrom3693 committed on
Commit
b71846a
1 Parent(s): d3801be

Create tokenizer_make.py

Files changed (1)
  1. tokenizer_make.py +53 -0
tokenizer_make.py ADDED
@@ -0,0 +1,53 @@
+ from transformers import BertTokenizerFast
+ import os
+
+ class MiniSunTokenizer:
+     def __init__(self, vocab_file=None):
+         # Use BERT's fast tokenizer, either with a custom vocabulary file or the default one
+         if vocab_file:
+             self.tokenizer = BertTokenizerFast(vocab_file=vocab_file, do_lower_case=False)
+         else:
+             # Default BERT tokenizer without a specific vocab file
+             self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
+
+         # Define special tokens if needed (customizable)
+         self.pad_token = '[PAD]'
+         self.unk_token = '[UNK]'
+         self.cls_token = '[CLS]'
+         self.sep_token = '[SEP]'
+         self.mask_token = '[MASK]'
+
+     def tokenize(self, text):
+         # Tokenizes the input text
+         return self.tokenizer.tokenize(text)
+
+     def encode(self, text, max_length=512, padding=True, truncation=True):
+         # Converts the text into input IDs and an attention mask
+         encoded = self.tokenizer.encode_plus(
+             text,
+             add_special_tokens=True,
+             max_length=max_length,
+             padding='max_length' if padding else False,
+             truncation=truncation,
+             return_attention_mask=True,
+             return_tensors='tf'
+         )
+         return encoded['input_ids'], encoded['attention_mask']
+
+     def decode(self, token_ids):
+         # Decodes token IDs back into text
+         return self.tokenizer.decode(token_ids, skip_special_tokens=True)
+
+     def save_pretrained(self, save_directory):
+         # Save the tokenizer in Hugging Face format
+         os.makedirs(save_directory, exist_ok=True)
+         self.tokenizer.save_pretrained(save_directory)
+
+ # Example usage of the tokenizer
+ tokenizer = MiniSunTokenizer()
+
+ text = "Hello, this is a test sentence for MiniSun model."
+ input_ids, attention_mask = tokenizer.encode(text, max_length=20)
+
+ print("Input IDs:", input_ids)
+ print("Attention Mask:", attention_mask)