Update convert_tokenizer.py
convert_tokenizer.py  (+23 -9)  CHANGED
@@ -1,24 +1,37 @@
+# phi_create_tokenizer_model.py
+# This script converts tokenizer.json to tokenizer.model and vocab.json to vocab.txt
 import json
-import sentencepiece as spm
 import os
 import shutil
+
+import sentencepiece as spm
 from transformers import AutoTokenizer
 
 def convert_to_sentencepiece(input_dir, output_dir):
     print(f"Converting tokenizer from {input_dir} to {output_dir}")
 
-    #
+    # Ensure a working tokenizer by copying all files
     os.makedirs(output_dir, exist_ok=True)
+    for filename in os.listdir(input_dir):
+        if filename.startswith("tokenizer"):
+            shutil.copyfile(os.path.join(input_dir, filename), os.path.join(output_dir, filename))
+
+    # Read tokenizer.json to get the vocabulary and added_tokens
+    tokenizer_path = os.path.join(input_dir, "tokenizer.json")
+    with open(tokenizer_path, 'r', encoding='utf-8') as f:
+        tokenizer_data = json.load(f)
+
+    vocab = tokenizer_data["model"]["vocab"]
+    added_tokens = tokenizer_data["added_tokens"]
 
-    #
-
-
-        vocab = json.load(f)
+    # Add the added tokens to the vocabulary with their correct IDs
+    for token_data in added_tokens:
+        vocab[token_data["content"]] = token_data["id"]
 
     # Create a temporary vocabulary file for SentencePiece
     temp_vocab_file = os.path.join(output_dir, "temp_vocab.txt")
     with open(temp_vocab_file, "w", encoding="utf-8") as f:
-        # Sort by token
+        # Sort by token ID to maintain correct order
         for token, index in sorted(vocab.items(), key=lambda x: x[1]):
             # SentencePiece expects tab-separated format: token<tab>score
             f.write(f"{token}\t1.0\n")
@@ -38,7 +51,8 @@ def convert_to_sentencepiece(input_dir, output_dir):
         eos_id=-1, # No end of sentence token
         pad_id=-1, # No padding token
         unk_id=0, # Unknown token ID
-        max_sentence_length=
+        max_sentence_length=131072, # Increased to 128K tokens for RoPE
+        num_threads=16 # Adjust based on your system's capabilities
     )
 
     # Clean up temporary file
@@ -61,7 +75,7 @@ def convert_to_sentencepiece(input_dir, output_dir):
     print(f"SentencePiece tokenizer: {tokens_sp}")
 
 if __name__ == "__main__":
-    input_dir = "/mnt/llm/models/phi-4/model"
+    input_dir = "/mnt/llm/models/phi-4/model"
     output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"
 
     convert_to_sentencepiece(input_dir, output_dir)
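Sanity-check sketch (not part of this commit): the script's own verification step is only partly visible in the hunks above, so the snippet below shows one way to load the converted model afterwards and compare it with the original Hugging Face tokenizer. It assumes the trainer writes tokenizer.model into output_dir, as the new header comment suggests, and reuses the paths from the __main__ block; adjust them for your setup.

import os

import sentencepiece as spm
from transformers import AutoTokenizer

input_dir = "/mnt/llm/models/phi-4/model"
output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"

# Load both tokenizers and compare how they split the same sample text.
hf_tokenizer = AutoTokenizer.from_pretrained(input_dir)
sp = spm.SentencePieceProcessor(model_file=os.path.join(output_dir, "tokenizer.model"))  # assumed output filename

text = "Converting the tokenizer should not change how text is split."
print("HF tokenizer:           ", hf_tokenizer.tokenize(text))
print("SentencePiece tokenizer:", sp.encode(text, out_type=str))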