Windows-ready script, including chunking of over-long input by sentences and then by token count (kept deliberately modest). It reads its text input for the model from an input.txt file. Tested on Windows 11; it assumes the repo files (models.py, kokoro.py, kokoro-v0_19.pth, and the voices/ folder) are in the working directory and that eSpeak NG is installed.

from kokoro import generate
import torch
import os
from models import build_model
from scipy.io.wavfile import write
import numpy as np
import nltk
from transformers import AutoTokenizer

# Download NLTK tokenizer resources (run this once)
nltk.download('punkt')
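# Note (my addition): newer NLTK releases load a different Punkt format; if
# sent_tokenize complains about missing resources, you may also need:
# nltk.download('punkt_tab')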

# Ensure eSpeak NG paths
os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = r"C:\\Program Files\\eSpeak NG\\libespeak-ng.dll"
os.environ["PHONEMIZER_ESPEAK_PATH"] = r"C:\\Program Files\\eSpeak NG\\espeak-ng.exe"

# Set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load Kokoro model
try:
    MODEL = build_model('kokoro-v0_19.pth', device)
except FileNotFoundError:
    raise FileNotFoundError("Model file 'kokoro-v0_19.pth' not found. Ensure the file exists in the specified path.")

# Set voice name and load voicepack
VOICE_NAME = 'af'
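# 'af' is the default American English voice (per the model card, a 50-50
# Bella/Sarah mix); its first letter also selects the language passed to
# generate() below ('a' = American English, 'b' = British English).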
try:
    VOICEPACK = torch.load(f'voices/{VOICE_NAME}.pt', weights_only=True).to(device)
except FileNotFoundError:
    raise FileNotFoundError(f"Voicepack file 'voices/{VOICE_NAME}.pt' not found. Ensure the voice file exists.")

# Read text input from input.txt
input_file = "input.txt"
try:
    with open(input_file, "r", encoding="utf-8") as f:
        text = f.read().strip()
except FileNotFoundError:
    raise FileNotFoundError(f"Input file '{input_file}' not found. Please create the file in the same directory.")
except Exception as e:
    raise RuntimeError(f"Error reading input file: {str(e)}")

# Check emptiness outside the try block so the ValueError isn't swallowed
# and re-wrapped as a misleading "Error reading input file" RuntimeError.
if not text:
    raise ValueError("Input text file is empty.")

# Load Hugging Face tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Effective Kokoro token limit
EFFECTIVE_TOKEN_LIMIT = 50  # Conservative limit to avoid truncation
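# Why so small (my note): Kokoro itself works on phoneme tokens (per the model
# card, input past ~510 tokens is trimmed), and BERT wordpiece counts are only a
# rough proxy for that, hence the deliberately conservative limit. Raise it if
# your chunks come out shorter than they need to be.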

# Split text into sentence-based chunks that stay under the token limit
def split_text_into_sentence_chunks(text, max_tokens):
    sentences = nltk.sent_tokenize(text)  # Split into sentences
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        # Tokenize the sentence
        sentence_tokens = tokenizer.encode(sentence, add_special_tokens=False)
        sentence_length = len(sentence_tokens)

        if current_length + sentence_length > max_tokens:
            # If adding this sentence exceeds the token limit, finalize the current chunk
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length
        else:
            # Add the sentence to the current chunk
            current_chunk.append(sentence)
            current_length += sentence_length

    # Add any remaining sentences to the last chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
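
# Illustrative example of the chunker (my addition; token counts approximate):
# with max_tokens=10, a text like
#   "Short one. Another short one. This third sentence is quite a bit longer."
# comes back as two chunks:
#   ["Short one. Another short one.", "This third sentence is quite a bit longer."]
# Note that a single sentence longer than max_tokens still becomes its own
# chunk, so extremely long run-on sentences can exceed the limit.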

# Split text into manageable chunks
text_chunks = split_text_into_sentence_chunks(text, max_tokens=EFFECTIVE_TOKEN_LIMIT)

# Generate and concatenate audio for each chunk
audio_segments = []
try:
    for i, chunk in enumerate(text_chunks, start=1):
        print(f"Processing chunk {i}/{len(text_chunks)}: {chunk[:50]}...")
        audio, _ = generate(MODEL, chunk, VOICEPACK, lang=VOICE_NAME[0])
        if isinstance(audio, torch.Tensor):
            audio = audio.cpu().numpy()
        audio_segments.append(audio)

    # Concatenate all audio segments
    full_audio = np.concatenate(audio_segments)
except Exception as e:
    raise RuntimeError(f"Error during audio generation: {str(e)}")

# Save concatenated audio to .wav file
output_path = "output.wav"
try:
    write(output_path, 24000, full_audio.astype(np.float32))
    print(f"Audio saved successfully to {output_path}")
except Exception as e:
    raise RuntimeError(f"Error saving audio to .wav: {str(e)}")

If you see room for improvement, please let me know; this code was made with ChatGPT. The output will be saved as output.wav.
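
One small improvement you could try (untested sketch, my own addition, not from the original script): append a short silence after each chunk so sentence boundaries don't sound clipped, e.g. inside the generation loop right after audio_segments.append(audio):

    audio_segments.append(np.zeros(int(0.25 * 24000), dtype=np.float32))  # ~250 ms pause

np.concatenate then stitches the padded list together exactly as before (the final pause just adds a little trailing silence).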

Thanks for this! With not much fuss this works in 'nix as well.
