finnstrom3693 committed on
Commit
b5e64c5
1 Parent(s): b71846a

Create tokenizer_make2.py

Browse files
Files changed (1) hide show
  1. tokenizer_make2.py +80 -0
tokenizer_make2.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import BertTokenizerFast
2
+ import os
3
+ import tensorflow as tf
4
+
5
class MiniSunTokenizer:
    """Thin wrapper around a BERT-style (WordPiece) fast tokenizer.

    Uses a custom vocab file when one is supplied, otherwise falls back to
    the pretrained ``bert-base-uncased`` tokenizer. Encoding returns plain
    Python lists (converted from TensorFlow tensors) so the output is
    JSON-serializable.
    """

    def __init__(self, vocab_file=None):
        """Create the underlying tokenizer.

        Args:
            vocab_file: Optional path to a WordPiece vocab file. When given,
                tokenization is case-sensitive; otherwise the pretrained
                (lower-casing) 'bert-base-uncased' tokenizer is loaded,
                which may download model files on first use.
        """
        if vocab_file:
            self.tokenizer = BertTokenizerFast(vocab_file=vocab_file, do_lower_case=False)
        else:
            self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

        # Special-token strings exposed by this wrapper.
        # NOTE(review): '[EOS]' is not a standard BERT special token and is
        # never registered with the underlying tokenizer, so text containing
        # '[EOS]' will be split into WordPieces rather than kept as one
        # token — confirm whether it should be added via
        # `add_special_tokens` (which would change the vocab size).
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.cls_token = '[CLS]'
        self.sep_token = '[SEP]'
        self.mask_token = '[MASK]'
        self.eos_token = '[EOS]'

    def encode(self, text, max_length=512, padding=True, truncation=True):
        """Encode a single string or a batch (list) of strings.

        Automatically detects whether `text` is a batch or a single
        sentence and dispatches accordingly.

        Args:
            text: A string, or a list of strings for batch encoding.
            max_length: Maximum sequence length after special tokens.
            padding: When truthy, pad every sequence to `max_length`.
            truncation: When truthy, truncate sequences to `max_length`.

        Returns:
            Dict with 'input_ids' and 'attention_mask', each a nested
            list of shape [batch, seq] (batch is 1 for a single string).
        """
        if isinstance(text, list):
            return self._encode_batch(text, max_length, padding, truncation)
        return self._encode_single(text, max_length, padding, truncation)

    def _encode(self, text_or_texts, max_length, padding, truncation, batch):
        # Shared implementation: the original _encode_single/_encode_batch
        # bodies were identical except for which tokenizer method they
        # called, so the duplication is consolidated here.
        encode_fn = (self.tokenizer.batch_encode_plus if batch
                     else self.tokenizer.encode_plus)
        encoded = encode_fn(
            text_or_texts,
            add_special_tokens=True,
            # padding=True means "pad to max_length" in this wrapper, not
            # dynamic padding to the longest sequence in the batch.
            max_length=max_length,
            padding='max_length' if padding else False,
            truncation=truncation,
            return_attention_mask=True,
            return_tensors='tf',
        )
        # Convert TF tensors to plain nested lists for serializability.
        return {
            'input_ids': encoded['input_ids'].numpy().tolist(),
            'attention_mask': encoded['attention_mask'].numpy().tolist(),
        }

    def _encode_single(self, text, max_length=512, padding=True, truncation=True):
        # Encode one string; returned lists have shape [1, seq].
        return self._encode(text, max_length, padding, truncation, batch=False)

    def _encode_batch(self, texts, max_length=512, padding=True, truncation=True):
        # Encode a list of strings; returned lists have shape [len(texts), seq].
        return self._encode(texts, max_length, padding, truncation, batch=True)

    def decode(self, token_ids):
        """Decode token ids back into text, skipping special tokens.

        Fix: `encode` returns nested lists of shape [batch, seq], but the
        original implementation raised on nested input. A nested list is
        now handled: a single-row batch decodes to one string, a multi-row
        batch to a list of strings. A flat list of ints behaves as before.
        """
        if token_ids and isinstance(token_ids[0], (list, tuple)):
            decoded = [self.tokenizer.decode(row, skip_special_tokens=True)
                       for row in token_ids]
            return decoded[0] if len(decoded) == 1 else decoded
        return self.tokenizer.decode(token_ids, skip_special_tokens=True)

    def save_pretrained(self, save_directory):
        """Save the tokenizer in Hugging Face format to `save_directory`.

        The directory is created if it does not already exist.
        """
        os.makedirs(save_directory, exist_ok=True)
        self.tokenizer.save_pretrained(save_directory)

    def __call__(self, text, *args, **kwargs):
        """Allow the object to be called directly like ``tokenizer(text)``.

        Dispatches to `encode`, which handles both single strings and
        batches.
        """
        return self.encode(text, *args, **kwargs)
77
+
78
+
79
# Example usage of the tokenizer.
# NOTE(review): this runs at import time; with no vocab_file it calls
# BertTokenizerFast.from_pretrained('bert-base-uncased'), which may
# download model files over the network when the module is imported —
# consider guarding with `if __name__ == "__main__":` if that side
# effect is not intended for importers.
tokenizer = MiniSunTokenizer()