smcleod committed
Commit f14551e
1 Parent(s): ea21fc1

Update convert_tokenizer.py

Files changed (1): convert_tokenizer.py (+29 -5)
convert_tokenizer.py CHANGED
@@ -1,4 +1,4 @@
-from transformers import PreTrainedTokenizerFast, AutoTokenizer
+from transformers import AutoTokenizer
 import json
 import os
 import shutil
@@ -34,8 +34,7 @@ def convert_phi_tokenizer(input_dir, output_dir):
         'tokenizer.json',
         'tokenizer_config.json',
         'special_tokens_map.json',
-        'vocab.json',
-        'added_tokens.json'
+        'added_tokens.json'  # Moved added_tokens.json here
     ]
 
     # Files to copy directly (no JSON parsing)
@@ -77,7 +76,7 @@ def convert_phi_tokenizer(input_dir, output_dir):
         'add_prefix_space': False,
         'clean_up_tokenization_spaces': False,
         'model_max_length': 16384,
-        'tokenizer_class': 'GPT2Tokenizer',
+        'tokenizer_class': 'GPT2Tokenizer',  # Changed to GPT2Tokenizer
         'bos_token': '<|endoftext|>',
         'eos_token': '<|endoftext|>',
         'pad_token': '<|endoftext|>'
@@ -89,6 +88,31 @@ def convert_phi_tokenizer(input_dir, output_dir):
         json.dump(config, f, indent=2)
     print("Successfully updated config")
 
+    # Construct the vocabulary with added tokens
+    print("\nConstructing vocabulary...")
+    tokenizer_path = os.path.join(output_dir, "tokenizer.json")
+    tokenizer_data = safe_read_json(tokenizer_path)
+    if tokenizer_data is None:
+        print("Error: Unable to read tokenizer.json")
+        return
+
+    vocab = tokenizer_data["model"]["vocab"]
+    added_tokens = tokenizer_data.get("added_tokens", [])
+
+    for token_data in added_tokens:
+        content = token_data["content"]
+        if content not in vocab:
+            vocab[content] = token_data["id"]
+
+    vocab_size = len(vocab)
+    print(f"Vocabulary size: {vocab_size}")
+
+    # Save the vocabulary as vocab.json
+    vocab_output_path = os.path.join(output_dir, "vocab.json")
+    with open(vocab_output_path, 'w', encoding='utf-8') as f:
+        json.dump(vocab, f, indent=2)
+    print(f"Successfully saved vocabulary to {vocab_output_path}")
+
     print("\nAttempting to test tokenizer...")
     try:
         tokenizer = AutoTokenizer.from_pretrained(output_dir)
@@ -99,7 +123,7 @@ def convert_phi_tokenizer(input_dir, output_dir):
     print(f"Test text: {test_text}")
     print(f"Encoded: {tokens}")
     print(f"Decoded: {decoded}")
-
+
     # check if they're the same
     if test_text != decoded:
         print("Decoded text does not match original text!")