# phi_create_tokenizer_model.py
# This script converts tokenizer.json to tokenizer.model and vocab.json to vocab.txt

import json
import os
import shutil

import sentencepiece as spm
from transformers import AutoTokenizer


def convert_to_sentencepiece(input_dir, output_dir):
    print(f"Converting tokenizer from {input_dir} to {output_dir}")

    # Ensure a working tokenizer by copying all tokenizer files
    os.makedirs(output_dir, exist_ok=True)
    for filename in os.listdir(input_dir):
        if filename.startswith("tokenizer"):
            shutil.copyfile(
                os.path.join(input_dir, filename),
                os.path.join(output_dir, filename),
            )

    # Read tokenizer.json to get the vocabulary and added_tokens
    tokenizer_path = os.path.join(input_dir, "tokenizer.json")
    with open(tokenizer_path, "r", encoding="utf-8") as f:
        tokenizer_data = json.load(f)

    vocab = tokenizer_data["model"]["vocab"]
    added_tokens = tokenizer_data["added_tokens"]

    # Add the added tokens to the vocabulary with their correct IDs
    for token_data in added_tokens:
        vocab[token_data["content"]] = token_data["id"]

    # Create a temporary vocabulary file for SentencePiece
    temp_vocab_file = os.path.join(output_dir, "temp_vocab.txt")
    with open(temp_vocab_file, "w", encoding="utf-8") as f:
        # Sort by token ID to maintain correct order
        for token, index in sorted(vocab.items(), key=lambda x: x[1]):
            # SentencePiece expects tab-separated format: token<TAB>score
            f.write(f"{token}\t1.0\n")

    print("\nCreating SentencePiece model...")

    # Train the SentencePiece model using the vocabulary
    spm.SentencePieceTrainer.train(
        input=temp_vocab_file,
        model_prefix=os.path.join(output_dir, "tokenizer"),
        vocab_size=len(vocab),
        model_type="bpe",
        character_coverage=1.0,
        input_format="tsv",
        train_extremely_large_corpus=True,
        bos_id=-1,  # No beginning-of-sentence token
        eos_id=-1,  # No end-of-sentence token
        pad_id=-1,  # No padding token
        unk_id=0,   # Unknown token ID
        max_sentence_length=131072,  # Increased to 128K tokens for RoPE
        num_threads=16,  # Adjust based on your system's capabilities
    )

    # Clean up temporary file
    os.remove(temp_vocab_file)

    print("SentencePiece model created successfully")

    # Test the original tokenizer for comparison
    test_text = "Hello, world!"
    tokenizer = AutoTokenizer.from_pretrained(input_dir)
    tokens_orig = tokenizer.encode(test_text)

    # Test the SentencePiece model
    sp = spm.SentencePieceProcessor()
    sp.load(os.path.join(output_dir, "tokenizer.model"))
    tokens_sp = sp.encode_as_ids(test_text)

    print("\nTokenizer comparison test:")
    print(f"Original tokenizer: {tokens_orig}")
    print(f"SentencePiece tokenizer: {tokens_sp}")


if __name__ == "__main__":
    input_dir = "/mnt/llm/models/phi-4/model"
    output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"
    convert_to_sentencepiece(input_dir, output_dir)
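

# The header above also mentions converting vocab.json to vocab.txt, a step the
# main routine does not perform. The helper below is a minimal sketch of how that
# conversion could look, assuming vocab.json holds the usual token -> id mapping
# produced by Hugging Face tokenizers. The function name convert_vocab_to_txt and
# the one-token-per-line output format are assumptions, not part of the original
# script; it is not called from the __main__ block.
def convert_vocab_to_txt(input_dir, output_dir):
    vocab_json = os.path.join(input_dir, "vocab.json")
    vocab_txt = os.path.join(output_dir, "vocab.txt")

    with open(vocab_json, "r", encoding="utf-8") as f:
        vocab = json.load(f)

    with open(vocab_txt, "w", encoding="utf-8") as f:
        # Write one token per line, ordered by token ID
        for token, index in sorted(vocab.items(), key=lambda x: x[1]):
            f.write(f"{token}\n")

    print(f"Wrote {len(vocab)} tokens to {vocab_txt}")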