finnstrom3693
/

mini-sun-init-tf-110m

Model card Files Files and versions Community

mini-sun-init-tf-110m / tokenizer_make2.py

finnstrom3693's picture

Create tokenizer_make2.py

b5e64c5 verified 2 days ago

history blame contribute delete

No virus

3.02 kB

	from transformers import BertTokenizerFast
	import os
	import tensorflow as tf

	class MiniSunTokenizer:
	    """Thin convenience wrapper around a Hugging Face ``BertTokenizerFast``.

	    The wrapper exposes ``encode`` / ``decode`` / ``save_pretrained`` and is
	    callable, so ``tokenizer(text)`` is equivalent to ``tokenizer.encode(text)``.
	    Encodings are returned as plain nested Python lists.
	    """

	    def __init__(self, vocab_file=None):
	        """Build the underlying tokenizer.

	        Args:
	            vocab_file: Path to a WordPiece vocabulary file. When falsy, the
	                pretrained ``'bert-base-uncased'`` tokenizer is loaded instead.
	        """
	        if vocab_file:
	            # Custom vocabularies are loaded case-sensitively.
	            self.tokenizer = BertTokenizerFast(vocab_file=vocab_file, do_lower_case=False)
	        else:
	            self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

	        # Special-token strings kept as plain attributes. Nothing in this class
	        # registers them with ``self.tokenizer``.
	        self.pad_token = '[PAD]'
	        self.unk_token = '[UNK]'
	        self.cls_token = '[CLS]'
	        self.sep_token = '[SEP]'
	        self.mask_token = '[MASK]'
	        # NOTE(review): '[EOS]' is presumably absent from a stock BERT
	        # vocabulary -- confirm before relying on it.
	        self.eos_token = '[EOS]'

	    def encode(self, text, max_length=512, padding=True, truncation=True):
	        """Encode a single text or a list of texts.

	        Args:
	            text: A string, or a ``list`` of strings to encode as a batch.
	                Only ``list`` instances are treated as batches.
	            max_length: Maximum sequence length passed to the tokenizer.
	            padding: When truthy, pad every sequence to ``max_length``.
	            truncation: Forwarded to the tokenizer's ``truncation`` option.

	        Returns:
	            A dict with ``'input_ids'`` and ``'attention_mask'``, each a list
	            with one inner list per input text (a single text yields one row).
	        """
	        if isinstance(text, list):
	            return self._encode_batch(text, max_length, padding, truncation)
	        return self._encode_single(text, max_length, padding, truncation)

	    def _encode_single(self, text, max_length=512, padding=True, truncation=True):
	        """Encode one text; the result keeps a leading batch axis of size 1."""
	        encoded = self.tokenizer.encode_plus(
	            text,
	            add_special_tokens=True,
	            max_length=max_length,
	            padding='max_length' if padding else False,
	            truncation=truncation,
	            return_attention_mask=True,
	        )
	        # Without ``return_tensors`` the tokenizer returns flat lists for a
	        # single text; wrap them so callers still receive one row per input.
	        return {
	            'input_ids': [list(encoded['input_ids'])],
	            'attention_mask': [list(encoded['attention_mask'])],
	        }

	    def _encode_batch(self, texts, max_length=512, padding=True, truncation=True):
	        """Encode a list of texts into one row of ids / mask per text."""
	        encoded_batch = self.tokenizer.batch_encode_plus(
	            texts,
	            add_special_tokens=True,
	            max_length=max_length,
	            padding='max_length' if padding else False,
	            truncation=truncation,
	            return_attention_mask=True,
	        )
	        # Plain lists (instead of a tensor round-trip) also work when
	        # ``padding`` is off and the rows have different lengths.
	        return {
	            'input_ids': [list(row) for row in encoded_batch['input_ids']],
	            'attention_mask': [list(row) for row in encoded_batch['attention_mask']],
	        }

	    def decode(self, token_ids):
	        """Decode token ids back into text, dropping special tokens.

	        ``token_ids`` is handed to the underlying tokenizer's ``decode``
	        unchanged. Since ``encode`` returns one row per text, callers will
	        presumably want to pass a single row, e.g. ``encoded['input_ids'][0]``.
	        """
	        return self.tokenizer.decode(token_ids, skip_special_tokens=True)

	    def save_pretrained(self, save_directory):
	        """Save the underlying tokenizer to ``save_directory``, creating it if needed."""
	        os.makedirs(save_directory, exist_ok=True)
	        self.tokenizer.save_pretrained(save_directory)

	    def __call__(self, text, *args, **kwargs):
	        """Make the object callable: ``tokenizer(text, ...)`` forwards to ``encode``.

	        Extra positional and keyword arguments are passed through unchanged,
	        so ``tokenizer(text)`` uses ``encode``'s defaults.
	        """
	        return self.encode(text, *args, **kwargs)


	# Example usage: build a module-level tokenizer. No vocabulary file is
	# given, so the constructor loads the pretrained 'bert-base-uncased' tokenizer.
	tokenizer = MiniSunTokenizer(vocab_file=None)