mini-sun-init-tf-110m / tokenizer_make2.py
finnstrom3693's picture
Create tokenizer_make2.py
b5e64c5 verified
raw
history blame contribute delete
No virus
3.02 kB
from transformers import BertTokenizerFast
import os
import tensorflow as tf
class MiniSunTokenizer:
    """Thin convenience wrapper around a Hugging Face ``BertTokenizerFast``.

    The wrapper exposes ``encode`` / ``decode`` / ``save_pretrained`` and is
    callable, and always returns encodings as plain nested Python lists.
    """

    def __init__(self, vocab_file=None):
        """Build the underlying tokenizer.

        Args:
            vocab_file: Optional path to a vocabulary file. When given, a
                ``BertTokenizerFast`` is built from it with
                ``do_lower_case=False``; otherwise the pretrained
                ``'bert-base-uncased'`` tokenizer is loaded.
        """
        if vocab_file:
            self.tokenizer = BertTokenizerFast(vocab_file=vocab_file, do_lower_case=False)
        else:
            self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

        # Define special tokens
        # NOTE(review): these are stored as plain attributes only; nothing in
        # this class registers them (in particular '[EOS]') with
        # ``self.tokenizer`` -- confirm the vocabulary actually contains them.
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.cls_token = '[CLS]'
        self.sep_token = '[SEP]'
        self.mask_token = '[MASK]'
        self.eos_token = '[EOS]'

    def encode(self, text, max_length=512, padding=True, truncation=True):
        """Encode a single input or a batch of inputs.

        A ``list`` is treated as a batch; any other value is encoded as a
        single input.

        Args:
            text: The input to encode, or a list of inputs.
            max_length: Forwarded to the tokenizer as ``max_length``.
            padding: If truthy, pad every sequence to ``max_length``;
                otherwise no padding is applied.
            truncation: Forwarded to the tokenizer as ``truncation``.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'``, each a list
            of lists (one inner list per input, also for a single input).
        """
        if isinstance(text, list):  # If batch of texts, call batch_encode_plus
            return self._encode_batch(text, max_length, padding, truncation)
        # Single text input
        return self._encode_single(text, max_length, padding, truncation)

    def _encode_single(self, text, max_length=512, padding=True, truncation=True):
        """Encode one input and return it with a leading batch axis of size 1."""
        # No ``return_tensors``: the result is converted to Python lists
        # anyway, so building a framework tensor first only adds a hard
        # TensorFlow dependency.
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length' if padding else False,
            truncation=truncation,
            return_attention_mask=True,
        )
        # Wrap in an outer list so the shape matches the batch form
        # (and the previous tensor-based output, which was 2-D).
        return {
            'input_ids': [list(encoded['input_ids'])],
            'attention_mask': [list(encoded['attention_mask'])],
        }

    def _encode_batch(self, texts, max_length=512, padding=True, truncation=True):
        """Encode a list of inputs, one row per input."""
        # No ``return_tensors``: with ``padding=False`` the rows may have
        # different lengths, which cannot be packed into a rectangular
        # tensor. Plain lists handle ragged rows fine.
        encoded_batch = self.tokenizer.batch_encode_plus(
            texts,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length' if padding else False,
            truncation=truncation,
            return_attention_mask=True,
        )
        return {
            'input_ids': [list(row) for row in encoded_batch['input_ids']],
            'attention_mask': [list(row) for row in encoded_batch['attention_mask']],
        }

    def decode(self, token_ids):
        """Decode token IDs back into text, skipping special tokens.

        Args:
            token_ids: Either one sequence of token IDs, or a list/tuple of
                such sequences (the nested shape produced by ``encode``).

        Returns:
            A string for a single sequence, or a list of strings (one per
            row) when a nested list/tuple is given.
        """
        is_nested = (
            isinstance(token_ids, (list, tuple))
            and len(token_ids) > 0
            and isinstance(token_ids[0], (list, tuple))
        )
        if is_nested:
            # ``encode`` always returns nested lists; decode row by row so
            # ``decode(encode(x)['input_ids'])`` works.
            return [
                self.tokenizer.decode(row, skip_special_tokens=True)
                for row in token_ids
            ]
        return self.tokenizer.decode(token_ids, skip_special_tokens=True)

    def save_pretrained(self, save_directory):
        """Save the tokenizer in Hugging Face format.

        Args:
            save_directory: Target directory; created if it does not exist.
        """
        os.makedirs(save_directory, exist_ok=True)
        self.tokenizer.save_pretrained(save_directory)

    def __call__(self, text, *args, **kwargs):
        """Allow ``tokenizer(text)``; all arguments are forwarded to ``encode``."""
        return self.encode(text, *args, **kwargs)
# Example usage of the tokenizer.
# No vocab_file is passed, so the constructor takes its
# BertTokenizerFast.from_pretrained('bert-base-uncased') branch.
# NOTE(review): this appears to be a module-level statement, so it would run
# whenever the file is imported -- confirm that is intended.
tokenizer = MiniSunTokenizer()