|
|
|
|
|
import json |
|
import os |
|
import shutil |
|
|
|
import sentencepiece as spm |
|
from transformers import AutoTokenizer |
|
|
|
def convert_to_sentencepiece(input_dir, output_dir):
    """Build a SentencePiece model from a Hugging Face ``tokenizer.json``.

    The function performs four steps:

    1. Copy every regular file in ``input_dir`` whose name starts with
       ``tokenizer`` into ``output_dir`` (created if missing).
    2. Read ``input_dir/tokenizer.json`` and merge its ``added_tokens`` into
       the base vocabulary found under ``model.vocab``.
    3. Dump the merged vocabulary, ordered by token id, to a temporary TSV
       file and train a BPE SentencePiece model from it using the
       ``tokenizer`` prefix inside ``output_dir``.
    4. Encode a sample sentence with both the original tokenizer and the new
       SentencePiece model and print the two id sequences for comparison.

    NOTE(review): the SentencePiece model is *trained* on the vocabulary
    entries, so its ids presumably are not guaranteed to match the original
    tokenizer; the comparison printed at the end is the only check done here.

    Args:
        input_dir: Directory containing ``tokenizer.json`` and loadable by
            ``AutoTokenizer.from_pretrained``.
        output_dir: Directory that receives the copied tokenizer files and
            the generated SentencePiece model.

    Raises:
        FileNotFoundError: If ``input_dir`` or its ``tokenizer.json`` does
            not exist.
        KeyError: If ``tokenizer.json`` has no ``model.vocab`` entry.
    """
    print(f"Converting tokenizer from {input_dir} to {output_dir}")

    os.makedirs(output_dir, exist_ok=True)
    for filename in os.listdir(input_dir):
        source_path = os.path.join(input_dir, filename)
        # copyfile() cannot copy directories, so only regular files are taken.
        if filename.startswith("tokenizer") and os.path.isfile(source_path):
            shutil.copyfile(source_path, os.path.join(output_dir, filename))

    tokenizer_path = os.path.join(input_dir, "tokenizer.json")
    with open(tokenizer_path, 'r', encoding='utf-8') as f:
        tokenizer_data = json.load(f)

    # Work on a copy so the parsed JSON structure is left untouched.
    vocab = dict(tokenizer_data["model"]["vocab"])
    # "added_tokens" may be absent or null; treat both as "no extra tokens".
    for token_data in tokenizer_data.get("added_tokens") or []:
        vocab[token_data["content"]] = token_data["id"]

    temp_vocab_file = os.path.join(output_dir, "temp_vocab.txt")
    try:
        with open(temp_vocab_file, "w", encoding="utf-8") as f:
            # One "<token>\t<weight>" row per entry, ordered by token id;
            # every token gets the same placeholder weight.
            f.writelines(
                f"{token}\t1.0\n" for token in sorted(vocab, key=vocab.get)
            )

        print("\nCreating SentencePiece model...")

        spm.SentencePieceTrainer.train(
            input=temp_vocab_file,
            model_prefix=os.path.join(output_dir, "tokenizer"),
            vocab_size=len(vocab),
            model_type='bpe',
            character_coverage=1.0,
            input_format='tsv',
            train_extremely_large_corpus=True,
            bos_id=-1,
            eos_id=-1,
            pad_id=-1,
            unk_id=0,
            max_sentence_length=131072,
            num_threads=16
        )
    finally:
        # Remove the scratch file even when writing or training fails.
        if os.path.exists(temp_vocab_file):
            os.remove(temp_vocab_file)

    print("SentencePiece model created successfully")

    # Sanity check: encode the same text with both tokenizers.
    test_text = "Hello, world!"
    tokenizer = AutoTokenizer.from_pretrained(input_dir)
    tokens_orig = tokenizer.encode(test_text)

    sp = spm.SentencePieceProcessor()
    sp.load(os.path.join(output_dir, "tokenizer.model"))
    tokens_sp = sp.encode_as_ids(test_text)

    print("\nTokenizer comparison test:")
    print(f"Original tokenizer: {tokens_orig}")
    print(f"SentencePiece tokenizer: {tokens_sp}")
|
|
|
if __name__ == "__main__":
    # Script entry point: convert the phi-4 tokenizer using fixed locations
    # for the source model directory and the conversion output directory.
    convert_to_sentencepiece(
        input_dir="/mnt/llm/models/phi-4/model",
        output_dir="/mnt/llm/models/phi-4/converted_tokenizer",
    )
|
|