Update convert_tokenizer.py
convert_tokenizer.py  (CHANGED: +29 -5)
@@ -1,4 +1,4 @@
-from transformers import
+from transformers import AutoTokenizer
 import json
 import os
 import shutil
@@ -34,8 +34,7 @@ def convert_phi_tokenizer(input_dir, output_dir):
         'tokenizer.json',
         'tokenizer_config.json',
         'special_tokens_map.json',
-        '
-        'added_tokens.json'
+        'added_tokens.json'  # Moved added_tokens.json here
     ]
 
     # Files to copy directly (no JSON parsing)
@@ -77,7 +76,7 @@ def convert_phi_tokenizer(input_dir, output_dir):
         'add_prefix_space': False,
         'clean_up_tokenization_spaces': False,
         'model_max_length': 16384,
-        'tokenizer_class': 'GPT2Tokenizer',
+        'tokenizer_class': 'GPT2Tokenizer',  # Changed to GPT2Tokenizer
         'bos_token': '<|endoftext|>',
         'eos_token': '<|endoftext|>',
         'pad_token': '<|endoftext|>'
@@ -89,6 +88,31 @@ def convert_phi_tokenizer(input_dir, output_dir):
         json.dump(config, f, indent=2)
     print("Successfully updated config")
 
+    # Construct the vocabulary with added tokens
+    print("\nConstructing vocabulary...")
+    tokenizer_path = os.path.join(output_dir, "tokenizer.json")
+    tokenizer_data = safe_read_json(tokenizer_path)
+    if tokenizer_data is None:
+        print("Error: Unable to read tokenizer.json")
+        return
+
+    vocab = tokenizer_data["model"]["vocab"]
+    added_tokens = tokenizer_data.get("added_tokens", [])
+
+    for token_data in added_tokens:
+        content = token_data["content"]
+        if content not in vocab:
+            vocab[content] = token_data["id"]
+
+    vocab_size = len(vocab)
+    print(f"Vocabulary size: {vocab_size}")
+
+    # Save the vocabulary as vocab.json
+    vocab_output_path = os.path.join(output_dir, "vocab.json")
+    with open(vocab_output_path, 'w', encoding='utf-8') as f:
+        json.dump(vocab, f, indent=2)
+    print(f"Successfully saved vocabulary to {vocab_output_path}")
+
     print("\nAttempting to test tokenizer...")
     try:
         tokenizer = AutoTokenizer.from_pretrained(output_dir)
@@ -99,7 +123,7 @@ def convert_phi_tokenizer(input_dir, output_dir):
         print(f"Test text: {test_text}")
         print(f"Encoded: {tokens}")
         print(f"Decoded: {decoded}")
-
+
         # check if they're the same
         if test_text != decoded:
             print("Decoded text does not match original text!")