import json
import os

import sentencepiece as spm
from transformers import AutoTokenizer


def convert_to_sentencepiece(input_dir, output_dir):
    print(f"Converting tokenizer from {input_dir} to {output_dir}")

    # Make sure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Read vocab.json to get the vocabulary
    vocab_path = os.path.join(input_dir, "vocab.json")
    with open(vocab_path, 'r', encoding='utf-8') as f:
        vocab = json.load(f)

    # Create a temporary vocabulary file for SentencePiece
    temp_vocab_file = os.path.join(output_dir, "temp_vocab.txt")
    with open(temp_vocab_file, "w", encoding="utf-8") as f:
        # Sort by token id to maintain the correct order
        for token, index in sorted(vocab.items(), key=lambda x: x[1]):
            # SentencePiece expects tab-separated lines: token<TAB>score
            f.write(f"{token}\t1.0\n")

    print("\nCreating SentencePiece model...")

    # Train the SentencePiece model from the vocabulary file
    spm.SentencePieceTrainer.train(
        input=temp_vocab_file,
        model_prefix=os.path.join(output_dir, "tokenizer"),
        vocab_size=len(vocab),
        model_type='bpe',
        character_coverage=1.0,
        input_format='tsv',
        train_extremely_large_corpus=True,
        bos_id=-1,  # No beginning-of-sentence token
        eos_id=-1,  # No end-of-sentence token
        pad_id=-1,  # No padding token
        unk_id=0,   # Unknown token ID
        max_sentence_length=16384,
    )

    # Clean up the temporary file
    os.remove(temp_vocab_file)

    print("SentencePiece model created successfully")

    # Test the original tokenizer for comparison
    test_text = "Hello, world!"
    tokenizer = AutoTokenizer.from_pretrained(input_dir)
    tokens_orig = tokenizer.encode(test_text)

    # Test the new SentencePiece model
    sp = spm.SentencePieceProcessor()
    sp.load(os.path.join(output_dir, "tokenizer.model"))
    tokens_sp = sp.encode_as_ids(test_text)

    print("\nTokenizer comparison test:")
    print(f"Original tokenizer: {tokens_orig}")
    print(f"SentencePiece tokenizer: {tokens_sp}")


if __name__ == "__main__":
    input_dir = "/mnt/llm/models/phi-4/model"  # or "model", depending on which directory you want to use
    output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"
    convert_to_sentencepiece(input_dir, output_dir)