# convert_tokenizer.py
# Converts a Hugging Face tokenizer.json into a SentencePiece tokenizer.model by
# extracting the vocabulary (plus added tokens) and training a BPE model from it.
import json
import os
import shutil
import sentencepiece as spm
from transformers import AutoTokenizer


def convert_to_sentencepiece(input_dir, output_dir):
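    """Rebuild the Hugging Face fast tokenizer as a SentencePiece BPE model.

    The vocabulary (including added/special tokens) is read from
    input_dir/tokenizer.json, and a tokenizer.model / tokenizer.vocab pair is
    written to output_dir alongside copies of the original tokenizer files.
    """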
print(f"Converting tokenizer from {input_dir} to {output_dir}")
# Ensure a working tokenizer by copying all files
os.makedirs(output_dir, exist_ok=True)
for filename in os.listdir(input_dir):
if filename.startswith("tokenizer"):
shutil.copyfile(os.path.join(input_dir, filename), os.path.join(output_dir, filename))
    # Read tokenizer.json to get the vocabulary and added_tokens
    tokenizer_path = os.path.join(input_dir, "tokenizer.json")
    with open(tokenizer_path, "r", encoding="utf-8") as f:
        tokenizer_data = json.load(f)

    vocab = tokenizer_data["model"]["vocab"]
    added_tokens = tokenizer_data["added_tokens"]

    # Add the added tokens to the vocabulary with their correct IDs
    for token_data in added_tokens:
        vocab[token_data["content"]] = token_data["id"]
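    # Each added_tokens entry is a dict along the lines of
    #   {"id": <token id>, "content": "<|endoftext|>", "special": true, ...}
    # so special tokens land in the vocabulary at their original IDs.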
    # Create a temporary vocabulary file for SentencePiece
    temp_vocab_file = os.path.join(output_dir, "temp_vocab.txt")
    with open(temp_vocab_file, "w", encoding="utf-8") as f:
        # Sort by token ID to maintain the original ordering
        for token, _token_id in sorted(vocab.items(), key=lambda x: x[1]):
            # SentencePiece's TSV input format is: sentence<tab>frequency
            f.write(f"{token}\t1.0\n")
print("\nCreating SentencePiece model...")
# Train the SentencePiece model using the vocabulary
spm.SentencePieceTrainer.train(
input=temp_vocab_file,
model_prefix=os.path.join(output_dir, "tokenizer"),
vocab_size=len(vocab),
model_type='bpe',
character_coverage=1.0,
input_format='tsv',
train_extremely_large_corpus=True,
bos_id=-1, # No beginning of sentence token
eos_id=-1, # No end of sentence token
pad_id=-1, # No padding token
unk_id=0, # Unknown token ID
max_sentence_length=131072, # Increased to 128K tokens for RoPE
num_threads=16 # Adjust based on your system's capabilities
)
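    # SentencePieceTrainer writes <model_prefix>.model and <model_prefix>.vocab,
    # i.e. tokenizer.model and tokenizer.vocab inside output_dir.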
    # Clean up the temporary file
    os.remove(temp_vocab_file)
    print("SentencePiece model created successfully")
    # Test the original tokenizer for comparison
    test_text = "Hello, world!"
    tokenizer = AutoTokenizer.from_pretrained(input_dir)
    tokens_orig = tokenizer.encode(test_text)

    # Test the freshly built SentencePiece model
    sp = spm.SentencePieceProcessor()
    sp.load(os.path.join(output_dir, "tokenizer.model"))
    tokens_sp = sp.encode_as_ids(test_text)

    print("\nTokenizer comparison test:")
    print(f"Original tokenizer:      {tokens_orig}")
    print(f"SentencePiece tokenizer: {tokens_sp}")


if __name__ == "__main__":
    input_dir = "/mnt/llm/models/phi-4/model"
    output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"
    convert_to_sentencepiece(input_dir, output_dir)