"""Convert the phi-4 tokenizer files into a GPT-2-style layout (vocab.json + merges.txt)."""

import json
import os
import shutil

from transformers import AutoTokenizer


def safe_read_json(filepath):
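    """Read and parse a JSON file, returning None (and printing a message) on failure."""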
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"File not found: {filepath}")
        return None
    except Exception as e:
        print(f"Error reading {filepath}: {str(e)}")
        return None


def safe_copy_file(src, dst):
    try:
        shutil.copy2(src, dst)
        print(f"Successfully copied {os.path.basename(src)}")
        return True
    except Exception as e:
        print(f"Error copying {src}: {str(e)}")
        return False


def convert_phi_tokenizer(input_dir, output_dir):
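    """Convert a phi-4 tokenizer directory into a GPT-2-style tokenizer.

    Copies the JSON tokenizer files and merges.txt, rewrites tokenizer_config.json
    to use GPT2Tokenizer, builds vocab.json from tokenizer.json, and finishes with
    a quick encode/decode round-trip test via AutoTokenizer.
    """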
    print(f"Converting tokenizer from {input_dir} to {output_dir}")

    # Make sure the output directory exists.
    os.makedirs(output_dir, exist_ok=True)
    # Tokenizer files that are parsed and re-serialized as JSON.
    json_files = [
        'tokenizer.json',
        'tokenizer_config.json',
        'special_tokens_map.json',
        'added_tokens.json'
    ]

    # Files that are copied verbatim.
    copy_files = [
        'merges.txt'
    ]
    print("\nFound files:")
    for f in os.listdir(input_dir):
        print(f"- {f}")
    # Re-serialize each JSON file into the output directory.
    for filename in json_files:
        input_path = os.path.join(input_dir, filename)
        if os.path.exists(input_path):
            print(f"\nProcessing {filename}")
            content = safe_read_json(input_path)
            if content is not None:
                output_path = os.path.join(output_dir, filename)
                with open(output_path, 'w', encoding='utf-8') as f:
                    json.dump(content, f, indent=2)
                print(f"Successfully copied {filename}")
    for filename in copy_files:
        input_path = os.path.join(input_dir, filename)
        if os.path.exists(input_path):
            print(f"\nCopying {filename}")
            safe_copy_file(input_path, os.path.join(output_dir, filename))
    # Rewrite tokenizer_config.json so the converted tokenizer loads as a GPT-2-style
    # BPE tokenizer with <|endoftext|> as the BOS/EOS/pad token.
    config_path = os.path.join(input_dir, 'tokenizer_config.json')
    if os.path.exists(config_path):
        print("\nProcessing tokenizer config")
        config = safe_read_json(config_path)
        if config is not None:
            config.update({
                'add_prefix_space': False,
                'clean_up_tokenization_spaces': False,
                'model_max_length': 16384,
                'tokenizer_class': 'GPT2Tokenizer',
                'bos_token': '<|endoftext|>',
                'eos_token': '<|endoftext|>',
                'pad_token': '<|endoftext|>'
            })

            output_config_path = os.path.join(output_dir, 'tokenizer_config.json')
            with open(output_config_path, 'w', encoding='utf-8') as f:
                json.dump(config, f, indent=2)
            print("Successfully updated config")
    # Build vocab.json from tokenizer.json: start from the BPE vocabulary and
    # merge in any added tokens that are not already present.
    print("\nConstructing vocabulary...")
    tokenizer_path = os.path.join(output_dir, "tokenizer.json")
    tokenizer_data = safe_read_json(tokenizer_path)
    if tokenizer_data is None:
        print("Error: Unable to read tokenizer.json")
        return

    vocab = tokenizer_data["model"]["vocab"]
    added_tokens = tokenizer_data.get("added_tokens", [])

    for token_data in added_tokens:
        content = token_data["content"]
        if content not in vocab:
            vocab[content] = token_data["id"]

    vocab_size = len(vocab)
    print(f"Vocabulary size: {vocab_size}")

    vocab_output_path = os.path.join(output_dir, "vocab.json")
    with open(vocab_output_path, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, indent=2)
    print(f"Successfully saved vocabulary to {vocab_output_path}")
    # Smoke-test the converted tokenizer with a simple encode/decode round trip.
    print("\nAttempting to test tokenizer...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(output_dir)
        test_text = "Hello, world!"
        tokens = tokenizer.encode(test_text)
        decoded = tokenizer.decode(tokens)
        print("Tokenizer test successful!")
        print(f"Test text: {test_text}")
        print(f"Encoded: {tokens}")
        print(f"Decoded: {decoded}")

        if test_text != decoded:
            print("Decoded text does not match original text!")
        else:
            print("Decoded text matches original text!")

        tokenizer.save_pretrained(output_dir)
        print(f"Tokenizer saved to {output_dir}")

    except Exception as e:
        print(f"Error testing tokenizer: {e}")


if __name__ == "__main__":
    input_dir = "/mnt/llm/models/phi-4/model"
    output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"

    convert_phi_tokenizer(input_dir, output_dir)
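

# Optional follow-up check, sketched here as a comment rather than part of the
# conversion flow above: loading the converted directory with GPT2Tokenizer
# directly would confirm that vocab.json and merges.txt are usable on their own
# (the AutoTokenizer test above may pick the fast tokenizer backed by
# tokenizer.json instead). The path and sample text below are illustrative only.
#
#     from transformers import GPT2Tokenizer
#
#     slow_tok = GPT2Tokenizer.from_pretrained("/mnt/llm/models/phi-4/converted_tokenizer")
#     ids = slow_tok.encode("Hello, world!")
#     print(ids, slow_tok.decode(ids))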