import json
import os

import sentencepiece as spm
from transformers import AutoTokenizer
def convert_to_sentencepiece(input_dir, output_dir):
    print(f"Converting tokenizer from {input_dir} to {output_dir}")

    os.makedirs(output_dir, exist_ok=True)

    # Load the Hugging Face vocabulary (token -> id mapping).
    vocab_path = os.path.join(input_dir, "vocab.json")
    with open(vocab_path, "r", encoding="utf-8") as f:
        vocab = json.load(f)

    # Dump the vocabulary as a TSV "corpus" for the SentencePiece trainer:
    # one token per line in id order, written as token<TAB>frequency.
    temp_vocab_file = os.path.join(output_dir, "temp_vocab.txt")
    with open(temp_vocab_file, "w", encoding="utf-8") as f:
        for token, _ in sorted(vocab.items(), key=lambda x: x[1]):
            f.write(f"{token}\t1.0\n")
    print("\nCreating SentencePiece model...")

    # Train a BPE SentencePiece model over the vocabulary dump. BOS/EOS/PAD
    # are disabled so only the UNK piece (id 0) is reserved.
    spm.SentencePieceTrainer.train(
        input=temp_vocab_file,
        model_prefix=os.path.join(output_dir, "tokenizer"),
        vocab_size=len(vocab),
        model_type="bpe",
        character_coverage=1.0,
        input_format="tsv",
        train_extremely_large_corpus=True,
        bos_id=-1,
        eos_id=-1,
        pad_id=-1,
        unk_id=0,
        max_sentence_length=16384,
    )

    os.remove(temp_vocab_file)
    print("SentencePiece model created successfully")
    # Quick sanity check: encode the same text with the original Hugging Face
    # tokenizer and with the freshly trained SentencePiece model.
    test_text = "Hello, world!"
    tokenizer = AutoTokenizer.from_pretrained(input_dir)
    tokens_orig = tokenizer.encode(test_text)

    sp = spm.SentencePieceProcessor()
    sp.load(os.path.join(output_dir, "tokenizer.model"))
    tokens_sp = sp.encode_as_ids(test_text)
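    # Added sanity check (a small sketch, not in the original flow): report
    # both vocabulary sizes; they should line up if training reached the
    # requested vocab_size.
    print(f"\nVocab sizes: original={len(vocab)}, sentencepiece={sp.get_piece_size()}")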
    print("\nTokenizer comparison test:")
    print(f"Original tokenizer: {tokens_orig}")
    print(f"SentencePiece tokenizer: {tokens_sp}")
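    # Optional extra check (a sketch, not part of the original comparison):
    # decode both id sequences back to text; skip_special_tokens keeps any
    # HF special tokens out of the decoded string.
    decoded_orig = tokenizer.decode(tokens_orig, skip_special_tokens=True)
    decoded_sp = sp.decode_ids(tokens_sp)
    print(f"Original decode:      {decoded_orig!r}")
    print(f"SentencePiece decode: {decoded_sp!r}")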
if __name__ == "__main__":
    input_dir = "/mnt/llm/models/phi-4/model"
    output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"

    convert_to_sentencepiece(input_dir, output_dir)
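    # Possible follow-up (a sketch; assumes the converted model is meant for
    # an SPM-backed tokenizer class): transformers' LlamaTokenizer can load a
    # raw SentencePiece .model file via its vocab_file argument.
    from transformers import LlamaTokenizer

    llama_tok = LlamaTokenizer(vocab_file=os.path.join(output_dir, "tokenizer.model"))
    print(llama_tok.tokenize("Hello, world!"))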