from transformers import BertTokenizerFast
import os
import tensorflow as tf  # required at runtime: encode_plus/batch_encode_plus are called with return_tensors='tf'

class MiniSunTokenizer:
    def __init__(self, vocab_file=None):
        if vocab_file:
            self.tokenizer = BertTokenizerFast(vocab_file=vocab_file, do_lower_case=False)
        else:
            self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

        # Define special tokens. Note that '[EOS]' is not part of the base BERT
        # vocabulary; register it via self.tokenizer.add_special_tokens(...) before using it.
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.cls_token = '[CLS]'
        self.sep_token = '[SEP]'
        self.mask_token = '[MASK]'
        self.eos_token = '[EOS]'

    def encode(self, text, max_length=512, padding=True, truncation=True):
        """
        Encodes the input text (string or batch of strings).
        It automatically detects if the input is a batch or a single sentence.
        """
        if isinstance(text, list):  # If batch of texts, call batch_encode_plus
            return self._encode_batch(text, max_length, padding, truncation)
        else:  # Single text input
            return self._encode_single(text, max_length, padding, truncation)

    def _encode_single(self, text, max_length=512, padding=True, truncation=True):
        # Encode a single string; the output keeps a leading batch dimension of 1
        # because return_tensors='tf' always produces batched tensors
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length' if padding else False,
            truncation=truncation,
            return_attention_mask=True,
            return_tensors='tf'
        )
        return {
            'input_ids': encoded['input_ids'].numpy().tolist(),
            'attention_mask': encoded['attention_mask'].numpy().tolist()
        }

    def _encode_batch(self, texts, max_length=512, padding=True, truncation=True):
        # Encode a batch of strings
        encoded_batch = self.tokenizer.batch_encode_plus(
            texts,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length' if padding else False,
            truncation=truncation,
            return_attention_mask=True,
            return_tensors='tf'
        )
        return {
            'input_ids': encoded_batch['input_ids'].numpy().tolist(),
            'attention_mask': encoded_batch['attention_mask'].numpy().tolist()
        }

    def decode(self, token_ids):
        # Decodes token IDs back into text
        return self.tokenizer.decode(token_ids, skip_special_tokens=True)

    def save_pretrained(self, save_directory):
        # Save the tokenizer in Hugging Face format
        os.makedirs(save_directory, exist_ok=True)
        self.tokenizer.save_pretrained(save_directory)

    def __call__(self, text, *args, **kwargs):
        """
        This allows the tokenizer object to be called directly like `tokenizer(text)`.
        It will automatically detect if the input is a batch or a single sentence.
        """
        return self.encode(text, *args, **kwargs)


# Example usage of the tokenizer
tokenizer = MiniSunTokenizer()
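
# A minimal usage sketch (illustrative only; the sample sentences are placeholders
# and this assumes the default 'bert-base-uncased' vocabulary is available):
single = tokenizer("MiniSun is a compact transformer model.", max_length=16)
print(single['input_ids'])       # [[...]] - nested list with a batch dimension of 1
print(single['attention_mask'])  # 1 for real tokens, 0 for padding positions

batch = tokenizer(["First sentence.", "A second, slightly longer sentence."], max_length=16)
print(len(batch['input_ids']))   # one row of token IDs per input sentence

# Round-trip the single example back to text; special tokens are stripped
print(tokenizer.decode(single['input_ids'][0]))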