phi-4 / convert_tokenizer.py
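"""Convert the phi-4 Hugging Face tokenizer files into a GPT-2 style layout.

Reads tokenizer.json, tokenizer_config.json, special_tokens_map.json and (if
present) added_tokens.json from the input directory, rewrites the config to
use GPT2Tokenizer with <|endoftext|> as bos/eos/pad, flattens the vocabulary
into vocab.json alongside merges.txt, and round-trip tests the result.
"""
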
from transformers import AutoTokenizer
import json
import os
import shutil

def safe_read_json(filepath):
    """Read a JSON file, returning its contents or None on any error."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"File not found: {filepath}")
        return None
    except Exception as e:
        print(f"Error reading {filepath}: {str(e)}")
        return None

def safe_copy_file(src, dst):
    """Copy a file, returning True on success and False on any error."""
    try:
        shutil.copy2(src, dst)
        print(f"Successfully copied {os.path.basename(src)}")
        return True
    except Exception as e:
        print(f"Error copying {src}: {str(e)}")
        return False

def convert_phi_tokenizer(input_dir, output_dir):
    """Convert the phi-4 tokenizer files in input_dir into a GPT-2 style
    tokenizer layout (vocab.json + merges.txt) in output_dir."""
    print(f"Converting tokenizer from {input_dir} to {output_dir}")

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # JSON files to process (re-serialized with consistent formatting)
    json_files = [
        'tokenizer.json',
        'tokenizer_config.json',
        'special_tokens_map.json',
        'added_tokens.json'  # optional; only processed if present
    ]

    # Files to copy directly (no JSON parsing)
    copy_files = [
        'merges.txt'
    ]
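    # merges.txt holds the GPT-2 BPE merge rules that pair with vocab.json
    # and is copied verbatim rather than re-serialized.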

    # List what files we actually find
    print("\nFound files:")
    for f in os.listdir(input_dir):
        print(f"- {f}")

    # Process JSON files
    for filename in json_files:
        input_path = os.path.join(input_dir, filename)
        if os.path.exists(input_path):
            print(f"\nProcessing {filename}")
            content = safe_read_json(input_path)
            if content is not None:
                output_path = os.path.join(output_dir, filename)
                with open(output_path, 'w', encoding='utf-8') as f:
                    json.dump(content, f, indent=2)
                print(f"Successfully wrote {filename}")

    # Copy non-JSON files directly
    for filename in copy_files:
        input_path = os.path.join(input_dir, filename)
        if os.path.exists(input_path):
            print(f"\nCopying {filename}")
            safe_copy_file(input_path, os.path.join(output_dir, filename))

    # Load and modify the tokenizer config
    config_path = os.path.join(input_dir, 'tokenizer_config.json')
    if os.path.exists(config_path):
        print("\nProcessing tokenizer config")
        config = safe_read_json(config_path)
        if config is not None:
            config.update({
                'add_prefix_space': False,
                'clean_up_tokenization_spaces': False,
                'model_max_length': 16384,
                'tokenizer_class': 'GPT2Tokenizer',  # GPT-2 style BPE tokenizer (vocab.json + merges.txt)
                'bos_token': '<|endoftext|>',
                'eos_token': '<|endoftext|>',
                'pad_token': '<|endoftext|>'
            })

            # Save the modified config
            output_config_path = os.path.join(output_dir, 'tokenizer_config.json')
            with open(output_config_path, 'w') as f:
                json.dump(config, f, indent=2)
            print("Successfully updated config")
    # Construct the vocabulary with added tokens
    print("\nConstructing vocabulary...")
    tokenizer_path = os.path.join(output_dir, "tokenizer.json")
    tokenizer_data = safe_read_json(tokenizer_path)
    if tokenizer_data is None:
        print("Error: Unable to read tokenizer.json")
        return

    vocab = tokenizer_data["model"]["vocab"]
    added_tokens = tokenizer_data.get("added_tokens", [])
    for token_data in added_tokens:
        content = token_data["content"]
        if content not in vocab:
            vocab[content] = token_data["id"]

    vocab_size = len(vocab)
    print(f"Vocabulary size: {vocab_size}")

    # Save the vocabulary as vocab.json
    vocab_output_path = os.path.join(output_dir, "vocab.json")
    with open(vocab_output_path, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, indent=2)
    print(f"Successfully saved vocabulary to {vocab_output_path}")
print("\nAttempting to test tokenizer...")
try:
tokenizer = AutoTokenizer.from_pretrained(output_dir)
test_text = "Hello, world!"
tokens = tokenizer.encode(test_text)
decoded = tokenizer.decode(tokens)
print("Tokenizer test successful!")
print(f"Test text: {test_text}")
print(f"Encoded: {tokens}")
print(f"Decoded: {decoded}")
# check if they're the same
if test_text != decoded:
print("Decoded text does not match original text!")
else:
print("Decoded text matches original text!")
# save the tokenizer
tokenizer.save_pretrained(output_dir)
print(f"Tokenizer saved to {output_dir}")
except Exception as e:
print(f"Error testing tokenizer: {e}")


if __name__ == "__main__":
    input_dir = "/mnt/llm/models/phi-4/model"  # or "model" depending on which directory you want to use
    output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"
    convert_phi_tokenizer(input_dir, output_dir)
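
# Usage sketch (paths are the defaults hard-coded above):
#
#   $ python convert_tokenizer.py
#
# The converted directory can then be loaded like any GPT-2 style tokenizer:
#
#   >>> from transformers import AutoTokenizer
#   >>> tok = AutoTokenizer.from_pretrained("/mnt/llm/models/phi-4/converted_tokenizer")
#   >>> tok.encode("Hello, world!")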