# convert_tokenizer.py
# Converts a Hugging Face tokenizer.json into a SentencePiece tokenizer.model by
# extracting the vocabulary (plus added tokens) and training a BPE model from it.
import json
import os
import shutil
import sentencepiece as spm
from transformers import AutoTokenizer


def convert_to_sentencepiece(input_dir, output_dir):
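    """Rebuild the Hugging Face fast tokenizer as a SentencePiece BPE model.

    The vocabulary (including added/special tokens) is read from
    input_dir/tokenizer.json, and a tokenizer.model / tokenizer.vocab pair is
    written to output_dir alongside copies of the original tokenizer files.
    """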
print(f"Converting tokenizer from {input_dir} to {output_dir}")
# Ensure a working tokenizer by copying all files
os.makedirs(output_dir, exist_ok=True)
for filename in os.listdir(input_dir):
if filename.startswith("tokenizer"):
shutil.copyfile(os.path.join(input_dir, filename), os.path.join(output_dir, filename))
    # Read tokenizer.json to get the vocabulary and added_tokens
    tokenizer_path = os.path.join(input_dir, "tokenizer.json")
    with open(tokenizer_path, "r", encoding="utf-8") as f:
        tokenizer_data = json.load(f)

    vocab = tokenizer_data["model"]["vocab"]
    added_tokens = tokenizer_data["added_tokens"]

    # Add the added tokens to the vocabulary with their correct IDs
    for token_data in added_tokens:
        vocab[token_data["content"]] = token_data["id"]
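    # Each added_tokens entry is a dict along the lines of
    #   {"id": <token id>, "content": "<|endoftext|>", "special": true, ...}
    # so special tokens land in the vocabulary at their original IDs.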
    # Create a temporary vocabulary file for SentencePiece
    temp_vocab_file = os.path.join(output_dir, "temp_vocab.txt")
    with open(temp_vocab_file, "w", encoding="utf-8") as f:
        # Sort by token ID to maintain the original ordering
        for token, _token_id in sorted(vocab.items(), key=lambda x: x[1]):
            # SentencePiece's TSV input format is: sentence<tab>frequency
            f.write(f"{token}\t1.0\n")
print("\nCreating SentencePiece model...")
# Train the SentencePiece model using the vocabulary
spm.SentencePieceTrainer.train(
input=temp_vocab_file,
model_prefix=os.path.join(output_dir, "tokenizer"),
vocab_size=len(vocab),
model_type='bpe',
character_coverage=1.0,
input_format='tsv',
train_extremely_large_corpus=True,
bos_id=-1, # No beginning of sentence token
eos_id=-1, # No end of sentence token
pad_id=-1, # No padding token
unk_id=0, # Unknown token ID
max_sentence_length=131072, # Increased to 128K tokens for RoPE
num_threads=16 # Adjust based on your system's capabilities
)
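    # SentencePieceTrainer writes <model_prefix>.model and <model_prefix>.vocab,
    # i.e. tokenizer.model and tokenizer.vocab inside output_dir.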
    # Clean up the temporary file
    os.remove(temp_vocab_file)
    print("SentencePiece model created successfully")
    # Test the original tokenizer for comparison
    test_text = "Hello, world!"
    tokenizer = AutoTokenizer.from_pretrained(input_dir)
    tokens_orig = tokenizer.encode(test_text)

    # Test the freshly built SentencePiece model
    sp = spm.SentencePieceProcessor()
    sp.load(os.path.join(output_dir, "tokenizer.model"))
    tokens_sp = sp.encode_as_ids(test_text)

    print("\nTokenizer comparison test:")
    print(f"Original tokenizer:      {tokens_orig}")
    print(f"SentencePiece tokenizer: {tokens_sp}")


if __name__ == "__main__":
    input_dir = "/mnt/llm/models/phi-4/model"
    output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"
    convert_to_sentencepiece(input_dir, output_dir)