from transformers import AutoTokenizer
import json
import os
import shutil


def safe_read_json(filepath):
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"File not found: {filepath}")
        return None
    except Exception as e:
        print(f"Error reading {filepath}: {str(e)}")
        return None


def safe_copy_file(src, dst):
    try:
        shutil.copy2(src, dst)
        print(f"Successfully copied {os.path.basename(src)}")
        return True
    except Exception as e:
        print(f"Error copying {src}: {str(e)}")
        return False


def convert_phi_tokenizer(input_dir, output_dir):
    print(f"Converting tokenizer from {input_dir} to {output_dir}")

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # JSON files to process
    json_files = [
        'tokenizer.json',
        'tokenizer_config.json',
        'special_tokens_map.json',
        'added_tokens.json'  # Moved added_tokens.json here
    ]

    # Files to copy directly (no JSON parsing)
    copy_files = [
        'merges.txt'
    ]

    # List what files we actually find
    print("\nFound files:")
    for f in os.listdir(input_dir):
        print(f"- {f}")

    # Process JSON files
    for filename in json_files:
        input_path = os.path.join(input_dir, filename)
        if os.path.exists(input_path):
            print(f"\nProcessing {filename}")
            content = safe_read_json(input_path)
            if content is not None:
                output_path = os.path.join(output_dir, filename)
                with open(output_path, 'w', encoding='utf-8') as f:
                    json.dump(content, f, indent=2)
                print(f"Successfully copied {filename}")

    # Copy non-JSON files directly
    for filename in copy_files:
        input_path = os.path.join(input_dir, filename)
        if os.path.exists(input_path):
            print(f"\nCopying {filename}")
            safe_copy_file(input_path, os.path.join(output_dir, filename))

    # Load and modify the tokenizer config
    config_path = os.path.join(input_dir, 'tokenizer_config.json')
    if os.path.exists(config_path):
        print("\nProcessing tokenizer config")
        config = safe_read_json(config_path)
        if config is not None:
            config.update({
                'add_prefix_space': False,
                'clean_up_tokenization_spaces': False,
                'model_max_length': 16384,
                'tokenizer_class': 'GPT2Tokenizer',  # Changed to GPT2Tokenizer
                'bos_token': '<|endoftext|>',
                'eos_token': '<|endoftext|>',
                'pad_token': '<|endoftext|>'
            })

            # Save the modified config
            output_config_path = os.path.join(output_dir, 'tokenizer_config.json')
            with open(output_config_path, 'w') as f:
                json.dump(config, f, indent=2)
            print("Successfully updated config")

    # Construct the vocabulary with added tokens
    print("\nConstructing vocabulary...")
    tokenizer_path = os.path.join(output_dir, "tokenizer.json")
    tokenizer_data = safe_read_json(tokenizer_path)
    if tokenizer_data is None:
        print("Error: Unable to read tokenizer.json")
        return

    vocab = tokenizer_data["model"]["vocab"]
    added_tokens = tokenizer_data.get("added_tokens", [])
    for token_data in added_tokens:
        content = token_data["content"]
        if content not in vocab:
            vocab[content] = token_data["id"]

    vocab_size = len(vocab)
    print(f"Vocabulary size: {vocab_size}")

    # Save the vocabulary as vocab.json
    vocab_output_path = os.path.join(output_dir, "vocab.json")
    with open(vocab_output_path, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, indent=2)
    print(f"Successfully saved vocabulary to {vocab_output_path}")

    print("\nAttempting to test tokenizer...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(output_dir)
        test_text = "Hello, world!"
        tokens = tokenizer.encode(test_text)
        decoded = tokenizer.decode(tokens)
        print("Tokenizer test successful!")
        print(f"Test text: {test_text}")
        print(f"Encoded: {tokens}")
        print(f"Decoded: {decoded}")

        # check if they're the same
        if test_text != decoded:
            print("Decoded text does not match original text!")
        else:
            print("Decoded text matches original text!")

        # save the tokenizer
        tokenizer.save_pretrained(output_dir)
        print(f"Tokenizer saved to {output_dir}")
    except Exception as e:
        print(f"Error testing tokenizer: {e}")


if __name__ == "__main__":
    input_dir = "/mnt/llm/models/phi-4/model"  # or "model" depending on which directory you want to use
    output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"
    convert_phi_tokenizer(input_dir, output_dir)
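

# --- Optional sanity check (a sketch, not part of the conversion above) ---
# Compares token IDs from the original fast tokenizer with the converted
# GPT2Tokenizer for a few sample strings. The directory arguments mirror the
# paths assumed in __main__; adjust them to your layout. The helper is not
# called automatically -- invoke it by hand after the conversion finishes.
def compare_with_original(original_dir, converted_dir):
    original = AutoTokenizer.from_pretrained(original_dir)
    converted = AutoTokenizer.from_pretrained(converted_dir)
    samples = ["Hello, world!", "def add(a, b):\n    return a + b"]
    for text in samples:
        orig_ids = original.encode(text)
        conv_ids = converted.encode(text)
        status = "OK" if orig_ids == conv_ids else "MISMATCH"
        print(f"[{status}] {text!r}")
        if orig_ids != conv_ids:
            print(f"  original : {orig_ids}")
            print(f"  converted: {conv_ids}")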