from transformers import BertTokenizerFast
import os


class MiniSunTokenizer:
    def __init__(self, vocab_file=None):
        # You can use BERT's tokenizer or any custom vocabulary tokenizer
        if vocab_file:
            self.tokenizer = BertTokenizerFast(vocab_file=vocab_file, do_lower_case=False)
        else:
            # Default BERT tokenizer without a specific vocab file
            self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

        # Define special tokens if needed (customizable)
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.cls_token = '[CLS]'
        self.sep_token = '[SEP]'
        self.mask_token = '[MASK]'

    def tokenize(self, text):
        # Tokenizes the input text into subword strings
        return self.tokenizer.tokenize(text)

    def encode(self, text, max_length=512, padding=True, truncation=True):
        # Converts the text into input IDs and an attention mask
        # (return_tensors='tf' requires TensorFlow to be installed)
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length' if padding else False,
            truncation=truncation,
            return_attention_mask=True,
            return_tensors='tf'
        )
        return encoded['input_ids'], encoded['attention_mask']

    def decode(self, token_ids):
        # Decodes a sequence of token IDs back into text
        return self.tokenizer.decode(token_ids, skip_special_tokens=True)

    def save_pretrained(self, save_directory):
        # Save the tokenizer in Hugging Face format
        os.makedirs(save_directory, exist_ok=True)
        self.tokenizer.save_pretrained(save_directory)


# Example usage of the tokenizer
tokenizer = MiniSunTokenizer()
text = "Hello, this is a test sentence for MiniSun model."
input_ids, attention_mask = tokenizer.encode(text, max_length=20)

print("Input IDs:", input_ids)
print("Attention Mask:", attention_mask)
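
# A minimal round-trip and persistence sketch, assuming TensorFlow is installed
# and using a hypothetical save directory './minisun_tokenizer'. Note that
# encode() returns batched TF tensors of shape (1, max_length), so the first
# row is selected before decoding.
decoded_text = tokenizer.decode(input_ids[0])
print("Decoded:", decoded_text)

# Persist the tokenizer in Hugging Face format so it can be reloaded later
tokenizer.save_pretrained('./minisun_tokenizer')
reloaded = BertTokenizerFast.from_pretrained('./minisun_tokenizer')
print("Reloaded vocab size:", reloaded.vocab_size)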