# phi-4 / convert_tokenizer.py
import json
import os

import sentencepiece as spm
from transformers import AutoTokenizer


def convert_to_sentencepiece(input_dir, output_dir):
    print(f"Converting tokenizer from {input_dir} to {output_dir}")

    # Make sure the output directory exists before writing anything into it
    os.makedirs(output_dir, exist_ok=True)
    # Read vocab.json to get the vocabulary
    vocab_path = os.path.join(input_dir, "vocab.json")
    with open(vocab_path, "r", encoding="utf-8") as f:
        vocab = json.load(f)

    # Write a temporary vocabulary file for SentencePiece.
    # SentencePiece expects tab-separated lines: token<tab>score
    temp_vocab_file = os.path.join(output_dir, "temp_vocab.txt")
    with open(temp_vocab_file, "w", encoding="utf-8") as f:
        # Sort by token id to maintain the original vocabulary order
        for token, _ in sorted(vocab.items(), key=lambda x: x[1]):
            # Every token gets the same placeholder score of 1.0
            f.write(f"{token}\t1.0\n")
print("\nCreating SentencePiece model...")
# Train the SentencePiece model using the vocabulary
spm.SentencePieceTrainer.train(
input=temp_vocab_file,
model_prefix=os.path.join(output_dir, "tokenizer"),
vocab_size=len(vocab),
model_type='bpe',
character_coverage=1.0,
input_format='tsv',
train_extremely_large_corpus=True,
bos_id=-1, # No beginning of sentence token
eos_id=-1, # No end of sentence token
pad_id=-1, # No padding token
unk_id=0, # Unknown token ID
max_sentence_length=16384
)
# Clean up temporary file
os.remove(temp_vocab_file)
print("SentencePiece model created successfully")
    # Encode a sample string with the original tokenizer for comparison
    test_text = "Hello, world!"
    tokenizer = AutoTokenizer.from_pretrained(input_dir)
    tokens_orig = tokenizer.encode(test_text)

    # Encode the same string with the newly trained SentencePiece model
    sp = spm.SentencePieceProcessor()
    sp.load(os.path.join(output_dir, "tokenizer.model"))
    tokens_sp = sp.encode_as_ids(test_text)

    print("\nTokenizer comparison test:")
    print(f"Original tokenizer: {tokens_orig}")
    print(f"SentencePiece tokenizer: {tokens_sp}")


if __name__ == "__main__":
    # Adjust these paths to wherever the phi-4 tokenizer files live
    input_dir = "/mnt/llm/models/phi-4/model"  # or a local "model" directory, depending on which you want to use
    output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"
    convert_to_sentencepiece(input_dir, output_dir)