Update convert_tokenizer.py
convert_tokenizer.py  (+23 -9)  CHANGED
@@ -1,24 +1,37 @@
+# phi_create_tokenizer_model.py
+# This script converts tokenizer.json to tokenizer.model and vocab.json to vocab.txt
 import json
-import sentencepiece as spm
 import os
 import shutil
+
+import sentencepiece as spm
 from transformers import AutoTokenizer
 
 def convert_to_sentencepiece(input_dir, output_dir):
     print(f"Converting tokenizer from {input_dir} to {output_dir}")
 
-    #
+    # Ensure a working tokenizer by copying all files
     os.makedirs(output_dir, exist_ok=True)
+    for filename in os.listdir(input_dir):
+        if filename.startswith("tokenizer"):
+            shutil.copyfile(os.path.join(input_dir, filename), os.path.join(output_dir, filename))
+
+    # Read tokenizer.json to get the vocabulary and added_tokens
+    tokenizer_path = os.path.join(input_dir, "tokenizer.json")
+    with open(tokenizer_path, 'r', encoding='utf-8') as f:
+        tokenizer_data = json.load(f)
+
+    vocab = tokenizer_data["model"]["vocab"]
+    added_tokens = tokenizer_data["added_tokens"]
 
-    #
-
-
-        vocab = json.load(f)
+    # Add the added tokens to the vocabulary with their correct IDs
+    for token_data in added_tokens:
+        vocab[token_data["content"]] = token_data["id"]
 
     # Create a temporary vocabulary file for SentencePiece
     temp_vocab_file = os.path.join(output_dir, "temp_vocab.txt")
     with open(temp_vocab_file, "w", encoding="utf-8") as f:
-        # Sort by token
+        # Sort by token ID to maintain correct order
         for token, index in sorted(vocab.items(), key=lambda x: x[1]):
             # SentencePiece expects tab-separated format: token<tab>score
             f.write(f"{token}\t1.0\n")
@@ -38,7 +51,8 @@ def convert_to_sentencepiece(input_dir, output_dir):
         eos_id=-1, # No end of sentence token
         pad_id=-1, # No padding token
         unk_id=0, # Unknown token ID
-        max_sentence_length=
+        max_sentence_length=131072, # Increased to 128K tokens for RoPE
+        num_threads=16 # Adjust based on your system's capabilities
     )
 
     # Clean up temporary file
@@ -61,7 +75,7 @@ def convert_to_sentencepiece(input_dir, output_dir):
     print(f"SentencePiece tokenizer: {tokens_sp}")
 
 if __name__ == "__main__":
-    input_dir = "/mnt/llm/models/phi-4/model"
+    input_dir = "/mnt/llm/models/phi-4/model"
     output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"
 
     convert_to_sentencepiece(input_dir, output_dir)
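Sanity-check sketch (not part of this commit): the script's own verification step is only partly visible in the hunks above, so the snippet below shows one way to load the converted model afterwards and compare it with the original Hugging Face tokenizer. It assumes the trainer writes tokenizer.model into output_dir, as the new header comment suggests, and reuses the paths from the __main__ block; adjust them for your setup.

import os

import sentencepiece as spm
from transformers import AutoTokenizer

input_dir = "/mnt/llm/models/phi-4/model"
output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"

# Load both tokenizers and compare how they split the same sample text.
hf_tokenizer = AutoTokenizer.from_pretrained(input_dir)
sp = spm.SentencePieceProcessor(model_file=os.path.join(output_dir, "tokenizer.model"))  # assumed output filename

text = "Converting the tokenizer should not change how text is split."
print("HF tokenizer:           ", hf_tokenizer.tokenize(text))
print("SentencePiece tokenizer:", sp.encode(text, out_type=str))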