# phi_create_tokenizer_model.py
# This script converts tokenizer.json to tokenizer.model and vocab.json to vocab.txt

import json
import os
import shutil

import sentencepiece as spm
from transformers import AutoTokenizer


def convert_to_sentencepiece(input_dir, output_dir):
    print(f"Converting tokenizer from {input_dir} to {output_dir}")

    # Ensure a working tokenizer by copying all tokenizer files
    os.makedirs(output_dir, exist_ok=True)
    for filename in os.listdir(input_dir):
        if filename.startswith("tokenizer"):
            shutil.copyfile(
                os.path.join(input_dir, filename),
                os.path.join(output_dir, filename),
            )

    # Read tokenizer.json to get the vocabulary and added_tokens
    tokenizer_path = os.path.join(input_dir, "tokenizer.json")
    with open(tokenizer_path, "r", encoding="utf-8") as f:
        tokenizer_data = json.load(f)

    vocab = tokenizer_data["model"]["vocab"]
    added_tokens = tokenizer_data["added_tokens"]

    # Add the added tokens to the vocabulary with their correct IDs
    for token_data in added_tokens:
        vocab[token_data["content"]] = token_data["id"]

    # Create a temporary vocabulary file for SentencePiece
    temp_vocab_file = os.path.join(output_dir, "temp_vocab.txt")
    with open(temp_vocab_file, "w", encoding="utf-8") as f:
        # Sort by token ID to maintain correct order
        for token, index in sorted(vocab.items(), key=lambda x: x[1]):
            # SentencePiece expects tab-separated format: token<TAB>score
            f.write(f"{token}\t1.0\n")

    print("\nCreating SentencePiece model...")

    # Train the SentencePiece model using the vocabulary
    spm.SentencePieceTrainer.train(
        input=temp_vocab_file,
        model_prefix=os.path.join(output_dir, "tokenizer"),
        vocab_size=len(vocab),
        model_type="bpe",
        character_coverage=1.0,
        input_format="tsv",
        train_extremely_large_corpus=True,
        bos_id=-1,  # No beginning-of-sentence token
        eos_id=-1,  # No end-of-sentence token
        pad_id=-1,  # No padding token
        unk_id=0,   # Unknown token ID
        max_sentence_length=131072,  # Increased to 128K tokens for RoPE
        num_threads=16,  # Adjust based on your system's capabilities
    )

    # Clean up temporary file
    os.remove(temp_vocab_file)

    print("SentencePiece model created successfully")

    # Test the original tokenizer for comparison
    test_text = "Hello, world!"
    tokenizer = AutoTokenizer.from_pretrained(input_dir)
    tokens_orig = tokenizer.encode(test_text)

    # Test the SentencePiece model
    sp = spm.SentencePieceProcessor()
    sp.load(os.path.join(output_dir, "tokenizer.model"))
    tokens_sp = sp.encode_as_ids(test_text)

    print("\nTokenizer comparison test:")
    print(f"Original tokenizer: {tokens_orig}")
    print(f"SentencePiece tokenizer: {tokens_sp}")


if __name__ == "__main__":
    input_dir = "/mnt/llm/models/phi-4/model"
    output_dir = "/mnt/llm/models/phi-4/converted_tokenizer"
    convert_to_sentencepiece(input_dir, output_dir)
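

# The header above also mentions converting vocab.json to vocab.txt, a step the
# main routine does not perform. The helper below is a minimal sketch of how that
# conversion could look, assuming vocab.json holds the usual token -> id mapping
# produced by Hugging Face tokenizers. The function name convert_vocab_to_txt and
# the one-token-per-line output format are assumptions, not part of the original
# script; it is not called from the __main__ block.
def convert_vocab_to_txt(input_dir, output_dir):
    vocab_json = os.path.join(input_dir, "vocab.json")
    vocab_txt = os.path.join(output_dir, "vocab.txt")

    with open(vocab_json, "r", encoding="utf-8") as f:
        vocab = json.load(f)

    with open(vocab_txt, "w", encoding="utf-8") as f:
        # Write one token per line, ordered by token ID
        for token, index in sorted(vocab.items(), key=lambda x: x[1]):
            f.write(f"{token}\n")

    print(f"Wrote {len(vocab)} tokens to {vocab_txt}")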