smcleod committed on
Commit
e83cf6a
1 Parent(s): b3f0cc0

Update convert_tokenizer.py

Files changed (1)
  1. convert_tokenizer.py +23 -9
convert_tokenizer.py CHANGED
@@ -1,24 +1,37 @@
+# phi_create_tokenizer_model.py
+# This script converts tokenizer.json to tokenizer.model and vocab.json to vocab.txt
 import json
-import sentencepiece as spm
 import os
 import shutil
+
+import sentencepiece as spm
 from transformers import AutoTokenizer
 
 def convert_to_sentencepiece(input_dir, output_dir):
     print(f"Converting tokenizer from {input_dir} to {output_dir}")
 
-    # First ensure we have a working tokenizer by copying all files
+    # Ensure a working tokenizer by copying all files
     os.makedirs(output_dir, exist_ok=True)
+    for filename in os.listdir(input_dir):
+        if filename.startswith("tokenizer"):
+            shutil.copyfile(os.path.join(input_dir, filename), os.path.join(output_dir, filename))
+
+    # Read tokenizer.json to get the vocabulary and added_tokens
+    tokenizer_path = os.path.join(input_dir, "tokenizer.json")
+    with open(tokenizer_path, 'r', encoding='utf-8') as f:
+        tokenizer_data = json.load(f)
+
+    vocab = tokenizer_data["model"]["vocab"]
+    added_tokens = tokenizer_data["added_tokens"]
 
-    # Read vocab.json to get the vocabulary
-    vocab_path = os.path.join(input_dir, "vocab.json")
-    with open(vocab_path, 'r', encoding='utf-8') as f:
-        vocab = json.load(f)
+    # Add the added tokens to the vocabulary with their correct IDs
+    for token_data in added_tokens:
+        vocab[token_data["content"]] = token_data["id"]
 
     # Create a temporary vocabulary file for SentencePiece
     temp_vocab_file = os.path.join(output_dir, "temp_vocab.txt")
     with open(temp_vocab_file, "w", encoding="utf-8") as f:
-        # Sort by token id to maintain correct order
+        # Sort by token ID to maintain correct order
         for token, index in sorted(vocab.items(), key=lambda x: x[1]):
             # SentencePiece expects tab-separated format: token<tab>score
             f.write(f"{token}\t1.0\n")
@@ -38,7 +51,8 @@ def convert_to_sentencepiece(input_dir, output_dir):
         eos_id=-1,   # No end of sentence token
         pad_id=-1,   # No padding token
         unk_id=0,    # Unknown token ID
-        max_sentence_length=16384
+        max_sentence_length=131072,  # Increased to 128K tokens for RoPE
+        num_threads=16               # Adjust based on your system's capabilities
     )
 
     # Clean up temporary file
@@ -61,7 +75,7 @@ def convert_to_sentencepiece(input_dir, output_dir):
     print(f"SentencePiece tokenizer: {tokens_sp}")
 
 if __name__ == "__main__":
-    input_dir = "/mnt/llm/models/phi-4/model"  # or "model" depending on which directory you want to use
+    input_dir = "/mnt/llm/models/phi-4/model"
    output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"
 
     convert_to_sentencepiece(input_dir, output_dir)
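
For a quick sanity check after running the script, something along the lines of the script's own verification step could be used to compare the converted model against the original Hugging Face tokenizer. This is a minimal sketch, not part of the commit; it assumes the SentencePiece trainer wrote tokenizer.model into the output directory and that the paths used above are still valid.

# Hypothetical smoke test (not part of the commit); file names and paths are assumptions.
import sentencepiece as spm
from transformers import AutoTokenizer

input_dir = "/mnt/llm/models/phi-4/model"
output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"

sp = spm.SentencePieceProcessor(model_file=f"{output_dir}/tokenizer.model")
hf = AutoTokenizer.from_pretrained(input_dir)

text = "Hello, world!"
print("SentencePiece pieces:", sp.encode(text, out_type=str))
print("HF tokens:", hf.tokenize(text))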