Windows-ready script, including chunking of over-long input by sentences and then by token count (kept deliberately modest). It reads its text input for the model from an input.txt file. Tested on Windows 11; it assumes the repo files (models.py, kokoro.py, kokoro-v0_19.pth, and the voices/ folder) are in the working directory and that eSpeak NG is installed.

from kokoro import generate
import torch
import os
from models import build_model
from scipy.io.wavfile import write
import numpy as np
import nltk
from transformers import AutoTokenizer

# Download NLTK tokenizer resources (run this once)
nltk.download('punkt')
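# Note (my addition): newer NLTK releases load a different Punkt format; if
# sent_tokenize complains about missing resources, you may also need:
# nltk.download('punkt_tab')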

# Ensure eSpeak NG paths
os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = r"C:\\Program Files\\eSpeak NG\\libespeak-ng.dll"
os.environ["PHONEMIZER_ESPEAK_PATH"] = r"C:\\Program Files\\eSpeak NG\\espeak-ng.exe"

# Set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load Kokoro model
try:
    MODEL = build_model('kokoro-v0_19.pth', device)
except FileNotFoundError:
    raise FileNotFoundError("Model file 'kokoro-v0_19.pth' not found. Ensure the file exists in the specified path.")

# Set voice name and load voicepack
VOICE_NAME = 'af'
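# 'af' is the default American English voice (per the model card, a 50-50
# Bella/Sarah mix); its first letter also selects the language passed to
# generate() below ('a' = American English, 'b' = British English).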
try:
    VOICEPACK = torch.load(f'voices/{VOICE_NAME}.pt', weights_only=True).to(device)
except FileNotFoundError:
    raise FileNotFoundError(f"Voicepack file 'voices/{VOICE_NAME}.pt' not found. Ensure the voice file exists.")

# Read text input from input.txt
input_file = "input.txt"
try:
    with open(input_file, "r", encoding="utf-8") as f:
        text = f.read().strip()
except FileNotFoundError:
    raise FileNotFoundError(f"Input file '{input_file}' not found. Please create the file in the same directory.")
except Exception as e:
    raise RuntimeError(f"Error reading input file: {str(e)}")

# Check emptiness outside the try block so the ValueError isn't swallowed
# and re-wrapped as a misleading "Error reading input file" RuntimeError.
if not text:
    raise ValueError("Input text file is empty.")

# Load Hugging Face tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Effective Kokoro token limit
EFFECTIVE_TOKEN_LIMIT = 50  # Conservative limit to avoid truncation
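# Why so small (my note): Kokoro itself works on phoneme tokens (per the model
# card, input past ~510 tokens is trimmed), and BERT wordpiece counts are only a
# rough proxy for that, hence the deliberately conservative limit. Raise it if
# your chunks come out shorter than they need to be.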

# Split text into sentence-based chunks that stay under the token limit
def split_text_into_sentence_chunks(text, max_tokens):
    sentences = nltk.sent_tokenize(text)  # Split into sentences
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        # Tokenize the sentence
        sentence_tokens = tokenizer.encode(sentence, add_special_tokens=False)
        sentence_length = len(sentence_tokens)

        if current_length + sentence_length > max_tokens:
            # If adding this sentence exceeds the token limit, finalize the current chunk
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length
        else:
            # Add the sentence to the current chunk
            current_chunk.append(sentence)
            current_length += sentence_length

    # Add any remaining sentences to the last chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
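
# Illustrative example of the chunker (my addition; token counts approximate):
# with max_tokens=10, a text like
#   "Short one. Another short one. This third sentence is quite a bit longer."
# comes back as two chunks:
#   ["Short one. Another short one.", "This third sentence is quite a bit longer."]
# Note that a single sentence longer than max_tokens still becomes its own
# chunk, so extremely long run-on sentences can exceed the limit.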

# Split text into manageable chunks
text_chunks = split_text_into_sentence_chunks(text, max_tokens=EFFECTIVE_TOKEN_LIMIT)

# Generate and concatenate audio for each chunk
audio_segments = []
try:
    for i, chunk in enumerate(text_chunks, start=1):
        print(f"Processing chunk {i}/{len(text_chunks)}: {chunk[:50]}...")
        audio, _ = generate(MODEL, chunk, VOICEPACK, lang=VOICE_NAME[0])
        if isinstance(audio, torch.Tensor):
            audio = audio.cpu().numpy()
        audio_segments.append(audio)

    # Concatenate all audio segments
    full_audio = np.concatenate(audio_segments)
except Exception as e:
    raise RuntimeError(f"Error during audio generation: {str(e)}")

# Save concatenated audio to .wav file
output_path = "output.wav"
try:
    write(output_path, 24000, full_audio.astype(np.float32))
    print(f"Audio saved successfully to {output_path}")
except Exception as e:
    raise RuntimeError(f"Error saving audio to .wav: {str(e)}")

If you see room for improvement, please let me know; this code was made with ChatGPT. The output will be saved as output.wav.
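
One small improvement you could try (untested sketch, my own addition, not from the original script): append a short silence after each chunk so sentence boundaries don't sound clipped, e.g. inside the generation loop right after audio_segments.append(audio):

    audio_segments.append(np.zeros(int(0.25 * 24000), dtype=np.float32))  # ~250 ms pause

np.concatenate then stitches the padded list together exactly as before (the final pause just adds a little trailing silence).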

Thanks for this! With not much fuss this works in 'nix as well.
