"""Convert the phi-4 tokenizer files into a GPT-2-style layout (vocab.json + merges.txt)."""

import json
import os
import shutil

from transformers import AutoTokenizer


def safe_read_json(filepath):
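    """Read and parse a JSON file, returning None (and printing a message) on failure."""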
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"File not found: {filepath}")
        return None
    except Exception as e:
        print(f"Error reading {filepath}: {str(e)}")
        return None


def safe_copy_file(src, dst):
    try:
        shutil.copy2(src, dst)
        print(f"Successfully copied {os.path.basename(src)}")
        return True
    except Exception as e:
        print(f"Error copying {src}: {str(e)}")
        return False


def convert_phi_tokenizer(input_dir, output_dir):
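    """Convert a phi-4 tokenizer directory into a GPT-2-style tokenizer.

    Copies the JSON tokenizer files and merges.txt, rewrites tokenizer_config.json
    to use GPT2Tokenizer, builds vocab.json from tokenizer.json, and finishes with
    a quick encode/decode round-trip test via AutoTokenizer.
    """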
    print(f"Converting tokenizer from {input_dir} to {output_dir}")

    # Make sure the output directory exists.
    os.makedirs(output_dir, exist_ok=True)
    # Tokenizer files that are parsed and re-serialized as JSON.
    json_files = [
        'tokenizer.json',
        'tokenizer_config.json',
        'special_tokens_map.json',
        'added_tokens.json'
    ]

    # Files that are copied verbatim.
    copy_files = [
        'merges.txt'
    ]
    print("\nFound files:")
    for f in os.listdir(input_dir):
        print(f"- {f}")
    # Re-serialize each JSON file into the output directory.
    for filename in json_files:
        input_path = os.path.join(input_dir, filename)
        if os.path.exists(input_path):
            print(f"\nProcessing {filename}")
            content = safe_read_json(input_path)
            if content is not None:
                output_path = os.path.join(output_dir, filename)
                with open(output_path, 'w', encoding='utf-8') as f:
                    json.dump(content, f, indent=2)
                print(f"Successfully copied {filename}")
    for filename in copy_files:
        input_path = os.path.join(input_dir, filename)
        if os.path.exists(input_path):
            print(f"\nCopying {filename}")
            safe_copy_file(input_path, os.path.join(output_dir, filename))
    # Rewrite tokenizer_config.json so the converted tokenizer loads as a GPT-2-style
    # BPE tokenizer with <|endoftext|> as the BOS/EOS/pad token.
    config_path = os.path.join(input_dir, 'tokenizer_config.json')
    if os.path.exists(config_path):
        print("\nProcessing tokenizer config")
        config = safe_read_json(config_path)
        if config is not None:
            config.update({
                'add_prefix_space': False,
                'clean_up_tokenization_spaces': False,
                'model_max_length': 16384,
                'tokenizer_class': 'GPT2Tokenizer',
                'bos_token': '<|endoftext|>',
                'eos_token': '<|endoftext|>',
                'pad_token': '<|endoftext|>'
            })

            output_config_path = os.path.join(output_dir, 'tokenizer_config.json')
            with open(output_config_path, 'w', encoding='utf-8') as f:
                json.dump(config, f, indent=2)
            print("Successfully updated config")
    # Build vocab.json from tokenizer.json: start from the BPE vocabulary and
    # merge in any added tokens that are not already present.
    print("\nConstructing vocabulary...")
    tokenizer_path = os.path.join(output_dir, "tokenizer.json")
    tokenizer_data = safe_read_json(tokenizer_path)
    if tokenizer_data is None:
        print("Error: Unable to read tokenizer.json")
        return

    vocab = tokenizer_data["model"]["vocab"]
    added_tokens = tokenizer_data.get("added_tokens", [])

    for token_data in added_tokens:
        content = token_data["content"]
        if content not in vocab:
            vocab[content] = token_data["id"]

    vocab_size = len(vocab)
    print(f"Vocabulary size: {vocab_size}")

    vocab_output_path = os.path.join(output_dir, "vocab.json")
    with open(vocab_output_path, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, indent=2)
    print(f"Successfully saved vocabulary to {vocab_output_path}")
    # Smoke-test the converted tokenizer with a simple encode/decode round trip.
    print("\nAttempting to test tokenizer...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(output_dir)
        test_text = "Hello, world!"
        tokens = tokenizer.encode(test_text)
        decoded = tokenizer.decode(tokens)
        print("Tokenizer test successful!")
        print(f"Test text: {test_text}")
        print(f"Encoded: {tokens}")
        print(f"Decoded: {decoded}")

        if test_text != decoded:
            print("Decoded text does not match original text!")
        else:
            print("Decoded text matches original text!")

        tokenizer.save_pretrained(output_dir)
        print(f"Tokenizer saved to {output_dir}")

    except Exception as e:
        print(f"Error testing tokenizer: {e}")


if __name__ == "__main__":
    input_dir = "/mnt/llm/models/phi-4/model"
    output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"

    convert_phi_tokenizer(input_dir, output_dir)
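

# Optional follow-up check, sketched here as a comment rather than part of the
# conversion flow above: loading the converted directory with GPT2Tokenizer
# directly would confirm that vocab.json and merges.txt are usable on their own
# (the AutoTokenizer test above may pick the fast tokenizer backed by
# tokenizer.json instead). The path and sample text below are illustrative only.
#
#     from transformers import GPT2Tokenizer
#
#     slow_tok = GPT2Tokenizer.from_pretrained("/mnt/llm/models/phi-4/converted_tokenizer")
#     ids = slow_tok.encode("Hello, world!")
#     print(ids, slow_tok.decode(ids))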