Spaces:

Wismut
/

Kokoro_TTS_Compare

Running

File size: 17,786 Bytes

2eaa44a

#!/usr/bin/env python3
# tts_cli.py
"""
Example CLI for generating audio with Kokoro-StyleTTS2.

Usage:
  python tts_cli.py \
    --model /path/to/kokoro-v0_19.pth \
    --config /path/to/config.json \
    --text "Hello, my stinking friends from 1906! You stink." \
    --voicepack /path/to/af.pt \
    --output output.wav

Make sure:
1. `models.py` is in the same folder (with `build_model`, `Decoder`, etc.).
2. You have installed the needed libraries:
   pip install torch phonemizer munch soundfile pyyaml
3. The model is a checkpoint that your `build_model` can load.

Adapt as needed!
"""

import argparse
import os
import re
import torch
import soundfile as sf
import numpy as np
from phonemizer import backend as phonemizer_backend

# If you use eSpeak library:
try:
    from espeak_util import set_espeak_library

    set_espeak_library()
except ImportError:
    pass

# --------------------------------------------------------------------
# Import from your local `models.py` (requires that file to be present).
#   This example assumes `build_model` loads the entire TTS submodules
#   (bert, bert_encoder, predictor, decoder, text_encoder).
# --------------------------------------------------------------------
from models import build_model


def resplit_strings(arr):
    """
    Given a list of string tokens (e.g. words, phrases), tries to
    split them into two sub-lists whose total lengths are as balanced
    as possible. The goal is to chunk a large string in half without
    splitting in the middle of a word.
    """
    if not arr:
        return "", ""
    if len(arr) == 1:
        return arr[0], ""

    min_diff = float("inf")
    best_split = 0
    lengths = [len(s) for s in arr]
    spaces = len(arr) - 1
    left_len = 0
    right_len = sum(lengths) + spaces

    for i in range(1, len(arr)):
        # Add current word + space to left side
        left_len += lengths[i - 1] + (1 if i > 1 else 0)
        # Remove from right side
        right_len -= lengths[i - 1] + 1
        diff = abs(left_len - right_len)
        if diff < min_diff:
            min_diff = diff
            best_split = i

    return " ".join(arr[:best_split]), " ".join(arr[best_split:])


def recursive_split(text, lang="a"):
    """
    Splits a piece of text into smaller segments so that
    each segment's phoneme length < some ~limit (~500 tokens).
    """
    # We'll reuse your existing `phonemize_text` + `tokenize` from script 1
    # to see if it is < 512 tokens. If it is, return it as a single chunk.
    # Otherwise, split on punctuation or whitespace and recurse.

    # 1. Phonemize first, check length
    ps = phonemize_text(text, lang=lang, do_normalize=True)
    tokens = tokenize(ps)
    if len(tokens) < 512:
        return [(text, ps)]

    # If too large, we split on certain punctuation or fallback to whitespace
    # We'll look for punctuation that often indicates sentence boundaries
    # If none found, fallback to space-split
    for punctuation in [r"[.?!…]", r"[:,;—]"]:
        pattern = f"(?:(?<={punctuation})|(?<={punctuation}[\"'»])) "
        # Attempt to split on that punctuation
        splits = re.split(pattern, text)
        if len(splits) > 1:
            break
    else:
        # If we didn't break out, just do whitespace split
        splits = text.split(" ")

    # Use resplit_strings to chunk it about halfway
    left, right = resplit_strings(splits)
    # Recurse
    return recursive_split(left, lang=lang) + recursive_split(right, lang=lang)


def segment_and_tokenize(long_text, lang="a"):
    """
    Takes a large text, optionally normalizes or cleans it,
    then breaks it into a list of (segment_text, segment_phonemes).
    """
    # Additional cleaning if you want:
    # long_text = normalize_text(long_text) # your existing function
    # We chunk it up using recursive_split
    segments = recursive_split(long_text, lang=lang)
    return segments


# -------------- Normalization & Phonemization Routines -------------- #
def parens_to_angles(s):
    return s.replace("(", "«").replace(")", "»")


def split_num(num):
    num = num.group()
    if "." in num:
        return num
    elif ":" in num:
        h, m = [int(n) for n in num.split(":")]
        if m == 0:
            return f"{h} o'clock"
        elif m < 10:
            return f"{h} oh {m}"
        return f"{h} {m}"
    year = int(num[:4])
    if year < 1100 or year % 1000 < 10:
        return num
    left, right = num[:2], int(num[2:4])
    s = "s" if num.endswith("s") else ""
    if 100 <= year % 1000 <= 999:
        if right == 0:
            return f"{left} hundred{s}"
        elif right < 10:
            return f"{left} oh {right}{s}"
    return f"{left} {right}{s}"


def flip_money(m):
    m = m.group()
    bill = "dollar" if m[0] == "$" else "pound"
    if m[-1].isalpha():
        return f"{m[1:]} {bill}s"
    elif "." not in m:
        s = "" if m[1:] == "1" else "s"
        return f"{m[1:]} {bill}{s}"
    b, c = m[1:].split(".")
    s = "" if b == "1" else "s"
    c = int(c.ljust(2, "0"))
    coins = (
        f"cent{'' if c == 1 else 's'}"
        if m[0] == "$"
        else ("penny" if c == 1 else "pence")
    )
    return f"{b} {bill}{s} and {c} {coins}"


def point_num(num):
    a, b = num.group().split(".")
    return " point ".join([a, " ".join(b)])


def normalize_text(text):
    text = text.replace(chr(8216), "'").replace(chr(8217), "'")
    text = text.replace("«", chr(8220)).replace("»", chr(8221))
    text = text.replace(chr(8220), '"').replace(chr(8221), '"')
    text = parens_to_angles(text)

    # Replace some common full-width punctuation in CJK:
    for a, b in zip("、。！，：；？", ",.!,:;?"):
        text = text.replace(a, b + " ")

    text = re.sub(r"[^\S \n]", " ", text)
    text = re.sub(r"  +", " ", text)
    text = re.sub(r"(?<=\n) +(?=\n)", "", text)
    text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text)
    text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text)
    text = re.sub(r"\b(?:Ms\.|MS\.(?= [A-Z]))", "Miss", text)
    text = re.sub(r"\b(?:Mrs\.|MRS\.(?= [A-Z]))", "Mrs", text)
    text = re.sub(r"\betc\.(?! [A-Z])", "etc", text)
    text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text)
    text = re.sub(
        r"\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)",
        split_num,
        text,
    )
    text = re.sub(r"(?<=\d),(?=\d)", "", text)
    text = re.sub(
        r"(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b",
        flip_money,
        text,
    )
    text = re.sub(r"\d*\.\d+", point_num, text)
    text = re.sub(r"(?<=\d)-(?=\d)", " to ", text)  # Could be minus; adjust if needed
    text = re.sub(r"(?<=\d)S", " S", text)
    text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
    text = re.sub(r"(?<=X')S\b", "s", text)
    text = re.sub(
        r"(?:[A-Za-z]\.){2,} [a-z]", lambda m: m.group().replace(".", "-"), text
    )
    text = re.sub(r"(?i)(?<=[A-Z])\.(?=[A-Z])", "-", text)
    return text.strip()


# -------------------------------------------------------------------
# Vocab and Symbol Mapping
# -------------------------------------------------------------------
def get_vocab():
    _pad = "$"
    _punctuation = ';:,.!?¡¿—…"«»“” '
    _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
    symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
    dicts = {}
    for i, s in enumerate(symbols):
        dicts[s] = i
    return dicts


VOCAB = get_vocab()


def tokenize(ps: str):
    """Convert the phoneme string into integer tokens based on VOCAB."""
    return [VOCAB.get(p) for p in ps if p in VOCAB]


# -------------------------------------------------------------------
# Initialize a simple phonemizer
#   For English:
#       'a' ~ en-us
#       'b' ~ en-gb
# -------------------------------------------------------------------
phonemizers = dict(
    a=phonemizer_backend.EspeakBackend(
        language="en-us", preserve_punctuation=True, with_stress=True
    ),
    b=phonemizer_backend.EspeakBackend(
        language="en-gb", preserve_punctuation=True, with_stress=True
    ),
    # You can add more, e.g. 'j': some Japanese phonemizer, etc.
)


def phonemize_text(text, lang="a", do_normalize=True):
    if do_normalize:
        text = normalize_text(text)
    ps_list = phonemizers[lang].phonemize([text])
    ps = ps_list[0] if ps_list else ""

    # Some custom replacements (from your code)
    ps = ps.replace("kəkˈoːɹoʊ", "kˈoʊkəɹoʊ").replace("kəkˈɔːɹəʊ", "kˈəʊkəɹəʊ")
    ps = ps.replace("ʲ", "j").replace("r", "ɹ").replace("x", "k").replace("ɬ", "l")
    # Example: insert space before "hˈʌndɹɪd" if there's a letter, e.g. "nˈaɪn" => "nˈaɪn hˈʌndɹɪd"
    ps = re.sub(r"(?<=[a-zɹː])(?=hˈʌndɹɪd)", " ", ps)
    # "z" at the end of a word -> remove space (just your snippet)
    ps = re.sub(r' z(?=[;:,.!?¡¿—…"«»“” ]|$)', "z", ps)
    # If lang is 'a', handle "ninety" => "ninedi"? Just from your snippet:
    if lang == "a":
        ps = re.sub(r"(?<=nˈaɪn)ti(?!ː)", "di", ps)

    # Only keep valid symbols
    ps = "".join(p for p in ps if p in VOCAB)
    return ps.strip()


# -------------------------------------------------------------------
# Utility for generating text masks
# -------------------------------------------------------------------
def length_to_mask(lengths):
    # lengths is a Tensor of shape [B], containing the text length for each batch
    max_len = lengths.max()
    row_ids = torch.arange(max_len, device=lengths.device).unsqueeze(0)
    mask = row_ids.expand(lengths.shape[0], -1)
    return (mask + 1) > lengths.unsqueeze(1)


# -------------------------------------------------------------------
# The forward pass for inference (from your snippet).
# This version references `model.predictor`, `model.decoder`, etc.
# -------------------------------------------------------------------
@torch.no_grad()
def forward_tts(model, tokens, ref_s, speed=1.0):
    """
    model:   Munch with submodels: bert, bert_encoder, predictor, decoder, text_encoder
    tokens:  list[int], the tokenized input (without [0, ... , 0] yet)
    ref_s:   reference embedding (torch.Tensor)
    speed:   float, speed factor
    """
    device = ref_s.device
    tokens_t = torch.LongTensor([[0, *tokens, 0]]).to(device)  # add boundary tokens
    input_lengths = torch.LongTensor([tokens_t.shape[-1]]).to(device)
    text_mask = length_to_mask(input_lengths).to(device)

    # 1. Encode with BERT
    bert_dur = model.bert(tokens_t, attention_mask=(~text_mask).int())
    d_en = model.bert_encoder(bert_dur).transpose(-1, -2)

    # 2. Prosody predictor
    s = ref_s[
        :, 128:
    ]  # from your snippet: the last 128 is ???, or the first 128 is ???

    d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
    x, _ = model.predictor.lstm(d)
    duration = model.predictor.duration_proj(x)
    duration = torch.sigmoid(duration).sum(axis=-1) / speed
    pred_dur = torch.round(duration).clamp(min=1).long()

    # 3. Expand alignment
    total_len = pred_dur.sum().item()
    pred_aln_trg = torch.zeros(input_lengths, total_len, device=device)
    c_frame = 0
    for i in range(pred_aln_trg.size(0)):
        n = pred_dur[0, i].item()
        pred_aln_trg[i, c_frame : c_frame + n] = 1
        c_frame += n

    # 4. Run F0 + Noise predictor
    en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0)
    F0_pred, N_pred = model.predictor.F0Ntrain(en, s)

    # 5. Text encoder -> asr
    t_en = model.text_encoder(tokens_t, input_lengths, text_mask)
    asr = t_en @ pred_aln_trg.unsqueeze(0)

    # 6. Decode audio
    audio = model.decoder(asr, F0_pred, N_pred, ref_s[:, :128])  # B x audio_len
    return audio.squeeze().cpu().numpy()


def generate_tts(model, text, voicepack, lang="a", speed=1.0):
    """
    model:    the Munch returned by build_model(...)
    text:     the input text (string)
    voicepack: the torch Tensor reference embedding, or a dict of them
    lang:     'a' or 'b' or etc. from your phonemizers
    speed:    speech speed factor
    sample_rate: sampling rate for the output
    """
    # 1. Phonemize
    ps = phonemize_text(text, lang=lang, do_normalize=True)
    tokens = tokenize(ps)
    if not tokens:
        return None, ps

    # 2. Retrieve reference style
    #    If your voicepack is a single embedding for all lengths, adapt as needed.
    #    If your voicepack is something like `voicepack[len(tokens)]`, do that.
    #    If you have multiple voices, you might do something else.
    try:
        ref_s = voicepack[len(tokens)]
    except:
        # fallback if len(tokens) is out of range
        ref_s = voicepack[-1]
    ref_s = ref_s.to("cpu" if not next(model.bert.parameters()).is_cuda else "cuda")

    # 3. Generate
    audio = forward_tts(model, tokens, ref_s, speed=speed)
    return audio, ps


def generate_long_form_tts(model, full_text, voicepack, lang="a", speed=1.0):
    """
    Generate TTS for a large `full_text`, splitting it into smaller segments
    and concatenating the resulting audio.

    Returns: (np.float32 array) final_audio, list_of_segment_phonemes
    """
    # 1. Segment the text
    segments = segment_and_tokenize(full_text, lang=lang)
    # segments is a list of (seg_text, seg_phonemes)

    # 2. For each segment, call `generate_tts(...)`
    audio_chunks = []
    all_phonemes = []
    for i, (seg_text, seg_ps) in enumerate(segments, 1):
        print(f"[LongForm] Generating chunk {i}/{len(segments)}: {seg_text[:40]}...")
        audio, used_phonemes = generate_tts(
            model, seg_text, voicepack, lang=lang, speed=speed
        )
        if audio is not None:
            audio_chunks.append(audio)
            all_phonemes.append(used_phonemes)
        else:
            print(f"[LongForm] Skipped empty segment {i}...")

    if not audio_chunks:
        return None, []

    # 3. Concatenate the audio
    final_audio = np.concatenate(audio_chunks, axis=0)
    return final_audio, all_phonemes


# -------------------------------------------------------------------
# Main CLI
# -------------------------------------------------------------------
def main():
    parser = argparse.ArgumentParser(description="Kokoro-StyleTTS2 CLI Example")
    parser.add_argument(
        "--model",
        type=str,
        default="pretrained_models/Kokoro/kokoro-v0_19.pth",
        help="Path to your model checkpoint (e.g. kokoro-v0_19.pth).",
    )
    parser.add_argument(
        "--config",
        type=str,
        default="pretrained_models/Kokoro/config.json",
        help="Path to config.json (used by build_model).",
    )
    parser.add_argument(
        "--text",
        type=str,
        default="Hello world! This is Kokoro, a new text-to-speech model based on StyleTTS2 from 2024!",
        help="Text to be converted into speech.",
    )
    parser.add_argument(
        "--voicepack",
        type=str,
        default="pretrained_models/Kokoro/voices/af.pt",
        help="Path to a .pt file for your reference embedding(s).",
    )
    parser.add_argument(
        "--output", type=str, default="output.wav", help="Output WAV filename."
    )
    parser.add_argument(
        "--speed",
        type=float,
        default=1.0,
        help="Speech speed factor, e.g. 0.8 slower, 1.2 faster, etc.",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cpu",
        choices=["cpu", "cuda"],
        help="Device to run inference on.",
    )
    args = parser.parse_args()

    # 1. Build model using your local build_model function
    #    (which loads TextEncoder, Decoder, etc. and returns a Munch).
    if not os.path.isfile(args.config):
        raise FileNotFoundError(f"config.json not found: {args.config}")

    # Optionally load config as Munch (depends on your build_model usage)
    # But your snippet does something like:
    #   with open(config, 'r') as r: ...
    #   ...
    #   model = build_model(path, device)
    # We'll do the same but in a simpler form:
    device = (
        args.device if (args.device == "cuda" and torch.cuda.is_available()) else "cpu"
    )
    print(f"Loading model from: {args.model}")
    model = build_model(
        args.model, device
    )  # This requires that `args.model` is the checkpoint path

    # Because `build_model` returns a Munch (dict of submodules),
    # we can't just do `model.eval()`, we must set each submodule to eval:
    for k, subm in model.items():
        if isinstance(subm, torch.nn.Module):
            subm.eval()

    # 2. Load voicepack
    if not os.path.isfile(args.voicepack):
        raise FileNotFoundError(f"Voicepack file not found: {args.voicepack}")
    print(f"Loading voicepack from: {args.voicepack}")
    vp = torch.load(args.voicepack, map_location=device)
    # If your voicepack is an nn.Module, set it to eval as well
    if isinstance(vp, torch.nn.Module):
        vp.eval()

    # 3. Generate audio
    print(f"Generating speech for text: {args.text}")
    audio, phonemes = generate_long_form_tts(
        model, args.text, vp, lang="a", speed=args.speed
    )
    if audio is None:
        print("No tokens were generated (maybe empty text?). Exiting.")
        return

    # 4. Write WAV
    print(f"Writing output to: {args.output}")
    sf.write(args.output, audio, 22050)

    print("Finished!")
    print(f"Phonemes used: {phonemes}")


if __name__ == "__main__":
    main()