Spaces:

davit312
/

piper-TTS-Armenian

Runtime error

App Files Community

Davit committed on Oct 25, 2024

Commit

c608151

1 Parent(s): 088fbc7

app init

Browse files

Files changed (11) hide show

app.py +46 -0
piper/__init__.py +5 -0
piper/__main__.py +159 -0
piper/config.py +53 -0
piper/const.py +5 -0
piper/download.py +139 -0
piper/file_hash.py +46 -0
piper/util.py +12 -0
piper/voice.py +177 -0
piper/voices.json +0 -0
requirements.txt +4 -0

app.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import gradio as gr
+import wave
+import numpy as np
+from io import BytesIO
+from huggingface_hub import hf_hub_download
+from piper import PiperVoice
+from transformers import pipeline
+import typing
# Download the voice model and its JSON config from the Hugging Face Hub and
# load the voice once at import time; synthesize_speech reuses this object.
_VOICE_REPO = "davit312/hy-tts"
_VOICE_FILE = "hye_AM-gor-medium.onnx"
model_path = hf_hub_download(repo_id=_VOICE_REPO, filename=_VOICE_FILE)
config_path = hf_hub_download(repo_id=_VOICE_REPO, filename=f"{_VOICE_FILE}.json")
voice = PiperVoice.load(model_path, config_path)
def synthesize_speech(text):
    """Synthesize ``text`` with the loaded Piper voice for the Gradio UI.

    Args:
        text: Text taken from the input textbox.

    Returns:
        A ``(audio, None)`` pair, one value per output component wired to the
        button. ``audio`` is the complete in-memory WAV file as ``bytes``, or
        ``None`` when ``text`` is empty or only whitespace. The second value
        leaves the hidden text output empty.
    """
    if not text or not text.strip():
        # Nothing to speak: clear the player instead of emitting a
        # header-only WAV file.
        return None, None

    buffer = BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        # 16-bit mono PCM at the voice's sample rate.
        wav_file.setframerate(voice.config.sample_rate)
        wav_file.setsampwidth(2)
        wav_file.setnchannels(1)
        voice.synthesize(text, wav_file)

    # The buffer already holds a finished WAV container (header + samples).
    # Hand those bytes over as-is: reinterpreting them as int16 samples would
    # treat the header as audio, and converting back to bytes is a no-op.
    return buffer.getvalue(), None
# Build the web UI with Gradio Blocks.
with gr.Blocks(theme=gr.themes.Base()) as blocks:
    # The voice loaded by this app is Armenian (hye_AM), so the heading and
    # description say so; no Persian tooling is involved.
    gr.Markdown("# Armenian Text to Speech Synthesizer")
    gr.Markdown("Enter text to synthesize it into speech using Piper")
    input_text = gr.Textbox(label="Input text", lines=4)
    output_audio = gr.Audio(label="Synthesized Speech", type="numpy")
    # Hidden component that receives the second value returned by
    # synthesize_speech.
    output_text = gr.Textbox(label="Output Text", visible=False)
    submit_button = gr.Button("Synthesize")
    submit_button.click(
        synthesize_speech,
        inputs=input_text,
        outputs=[output_audio, output_text],
    )

# Run the app
blocks.launch()

piper/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .voice import PiperVoice
# Names exported by ``from piper import *``.
__all__ = ["PiperVoice"]

piper/__main__.py ADDED Viewed

	@@ -0,0 +1,159 @@

+import argparse
+import logging
+import sys
+import time
+import wave
+from pathlib import Path
+from typing import Any, Dict
+from . import PiperVoice
+from .download import ensure_voice_exists, find_voice, get_voices
# Path of this module and of its directory.
_FILE = Path(__file__)
_DIR = _FILE.parent

# The logger is named after this file's stem (its name without ".py").
_LOGGER = logging.getLogger(_FILE.stem)
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the Piper command-line interface."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
    parser.add_argument("-c", "--config", help="Path to model config file")
    parser.add_argument(
        "-f",
        "--output-file",
        "--output_file",
        help="Path to output WAV file (default: stdout)",
    )
    parser.add_argument(
        "-d",
        "--output-dir",
        "--output_dir",
        help="Path to output directory (default: cwd)",
    )
    parser.add_argument(
        "--output-raw",
        "--output_raw",
        action="store_true",
        help="Stream raw audio to stdout",
    )
    # Synthesis parameters (None means "use the value from the voice config")
    parser.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
    parser.add_argument(
        "--length-scale", "--length_scale", type=float, help="Phoneme length"
    )
    parser.add_argument(
        "--noise-scale", "--noise_scale", type=float, help="Generator noise"
    )
    parser.add_argument(
        "--noise-w", "--noise_w", type=float, help="Phoneme width noise"
    )
    # Hardware
    parser.add_argument("--cuda", action="store_true", help="Use GPU")
    # Post-processing
    parser.add_argument(
        "--sentence-silence",
        "--sentence_silence",
        type=float,
        default=0.0,
        help="Seconds of silence after each sentence",
    )
    # Voice lookup / download locations
    # NOTE(review): with action="append" argparse appends command-line values
    # to the default list, so the current directory always stays first (and is
    # therefore the default download dir) even when --data-dir is given.
    # Confirm that this is intended.
    parser.add_argument(
        "--data-dir",
        "--data_dir",
        action="append",
        default=[str(Path.cwd())],
        help="Data directory to check for downloaded models (default: current directory)",
    )
    parser.add_argument(
        "--download-dir",
        "--download_dir",
        help="Directory to download voices into (default: first data dir)",
    )
    parser.add_argument(
        "--update-voices",
        action="store_true",
        help="Download latest voices.json during startup",
    )
    # Diagnostics
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    return parser


def main() -> None:
    """Run the Piper command-line interface.

    Text is read from stdin. Depending on the options, audio is written as a
    raw stream to stdout (``--output-raw``), as one WAV file per input line
    (``--output-dir``), or as a single WAV to a file or stdout. If the model
    path does not exist it is treated as a voice name and downloaded first.
    """
    from io import BytesIO

    args = _build_arg_parser().parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    _LOGGER.debug(args)

    if not args.download_dir:
        # Download to first data directory by default
        args.download_dir = args.data_dir[0]

    # Download voice if file doesn't exist
    model_path = Path(args.model)
    if not model_path.exists():
        # Load voice info
        voices_info = get_voices(args.download_dir, update_voices=args.update_voices)

        # Resolve aliases for backwards compatibility with old voice names
        aliases_info: Dict[str, Any] = {}
        for voice_info in voices_info.values():
            for voice_alias in voice_info.get("aliases", []):
                aliases_info[voice_alias] = {"_is_alias": True, **voice_info}
        voices_info.update(aliases_info)

        ensure_voice_exists(args.model, args.data_dir, args.download_dir, voices_info)
        args.model, args.config = find_voice(args.model, args.data_dir)

    # Load voice
    voice = PiperVoice.load(args.model, config_path=args.config, use_cuda=args.cuda)
    synthesize_args: Dict[str, Any] = {
        "speaker_id": args.speaker,
        "length_scale": args.length_scale,
        "noise_scale": args.noise_scale,
        "noise_w": args.noise_w,
        "sentence_silence": args.sentence_silence,
    }

    if args.output_raw:
        # Read line-by-line
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue

            # Write raw audio to stdout as it is produced
            audio_stream = voice.synthesize_stream_raw(line, **synthesize_args)
            for audio_bytes in audio_stream:
                sys.stdout.buffer.write(audio_bytes)
                sys.stdout.buffer.flush()
    elif args.output_dir:
        output_dir = Path(args.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Read line-by-line; each non-empty line becomes its own WAV file
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue

            wav_path = output_dir / f"{time.monotonic_ns()}.wav"
            with wave.open(str(wav_path), "wb") as wav_file:
                voice.synthesize(line, wav_file, **synthesize_args)

            _LOGGER.info("Wrote %s", wav_path)
    else:
        # Read entire input
        text = sys.stdin.read()

        if (not args.output_file) or (args.output_file == "-"):
            # The wave writer has to seek back and patch the header once more
            # frames have been written than it first announced. That fails
            # when stdout is a pipe, so build the WAV in memory and emit it
            # in one piece.
            wav_buffer = BytesIO()
            with wave.open(wav_buffer, "wb") as wav_file:
                voice.synthesize(text, wav_file, **synthesize_args)
            sys.stdout.buffer.write(wav_buffer.getvalue())
            sys.stdout.buffer.flush()
        else:
            # Write to file
            with wave.open(args.output_file, "wb") as wav_file:
                voice.synthesize(text, wav_file, **synthesize_args)


if __name__ == "__main__":
    main()

piper/config.py ADDED Viewed

	@@ -0,0 +1,53 @@

+"""Piper configuration"""
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any, Dict, Mapping, Sequence
class PhonemeType(str, Enum):
    """Kind of phonemes a voice expects; members double as plain strings."""

    # Phonemes for an espeak-ng voice
    ESPEAK = "espeak"

    # Raw text
    TEXT = "text"
@dataclass
class PiperConfig:
    """Piper configuration"""

    # Number of phonemes
    num_symbols: int

    # Number of speakers
    num_speakers: int

    # Sample rate of output audio
    sample_rate: int

    # Name of espeak-ng voice or alphabet
    espeak_voice: str

    # Inference scales; from_dict falls back to 1.0 / 0.667 / 0.8 respectively
    length_scale: float
    noise_scale: float
    noise_w: float

    # Phoneme -> [id,]
    phoneme_id_map: Mapping[str, Sequence[int]]

    # espeak or text
    phoneme_type: PhonemeType

    @staticmethod
    def from_dict(config: Dict[str, Any]) -> "PiperConfig":
        """Build a configuration from a parsed voice config dictionary.

        Required keys: ``num_symbols``, ``num_speakers``,
        ``audio.sample_rate``, ``espeak.voice`` and ``phoneme_id_map``.
        The ``inference`` section and ``phoneme_type`` are optional.

        Raises:
            KeyError: If a required key is missing.
            ValueError: If ``phoneme_type`` is not a known phoneme type.
        """
        inference_settings = config.get("inference", {})

        symbol_count = config["num_symbols"]
        speaker_count = config["num_speakers"]
        rate = config["audio"]["sample_rate"]

        noise = inference_settings.get("noise_scale", 0.667)
        length = inference_settings.get("length_scale", 1.0)
        width_noise = inference_settings.get("noise_w", 0.8)

        voice_name = config["espeak"]["voice"]
        id_map = config["phoneme_id_map"]
        kind = PhonemeType(config.get("phoneme_type", PhonemeType.ESPEAK))

        return PiperConfig(
            num_symbols=symbol_count,
            num_speakers=speaker_count,
            sample_rate=rate,
            espeak_voice=voice_name,
            length_scale=length,
            noise_scale=noise,
            noise_w=width_noise,
            phoneme_id_map=id_map,
            phoneme_type=kind,
        )

piper/const.py ADDED Viewed

	@@ -0,0 +1,5 @@

+"""Constants"""
# Special symbols: padding (0), beginning of sentence, end of sentence.
PAD, BOS, EOS = "_", "^", "$"

piper/download.py ADDED Viewed

	@@ -0,0 +1,139 @@

+"""Utility for downloading Piper voices."""
+import json
+import logging
+import shutil
+from pathlib import Path
+from typing import Any, Dict, Iterable, Set, Tuple, Union
+from urllib.request import urlopen
+from .file_hash import get_file_hash
# Template for the URL of a voice file; "{file}" is filled in with the file's
# path as listed in voices.json (or with "voices.json" itself).
URL_FORMAT = "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/{file}"

# File names that are neither verified nor downloaded.
_SKIP_FILES = {"MODEL_CARD"}

# Directory of this module; holds the embedded voices.json.
_DIR = Path(__file__).parent

_LOGGER = logging.getLogger(__name__)
class VoiceNotFoundError(Exception):
    """Raised when a requested voice name is not in the voices info."""
def get_voices(
    download_dir: Union[str, Path], update_voices: bool = False
) -> Dict[str, Any]:
    """Loads available voices from downloaded or embedded JSON file.

    Args:
        download_dir: Directory that may contain a downloaded ``voices.json``.
        update_voices: Download the latest ``voices.json`` into
            ``download_dir`` before loading.

    Returns:
        The parsed contents of ``voices.json``.
    """
    download_dir = Path(download_dir)
    voices_download = download_dir / "voices.json"

    if update_voices:
        # Download latest voices.json
        voices_url = URL_FORMAT.format(file="voices.json")
        _LOGGER.debug("Downloading %s to %s", voices_url, voices_download)

        # The directory may not exist yet on a first run.
        download_dir.mkdir(parents=True, exist_ok=True)

        # Stream into a temporary sibling and swap it in only once complete,
        # so an interrupted download cannot leave a truncated voices.json
        # that would then be preferred over the embedded copy.
        partial_download = voices_download.with_name(voices_download.name + ".tmp")
        try:
            with urlopen(voices_url) as response, open(
                partial_download, "wb"
            ) as download_file:
                shutil.copyfileobj(response, download_file)
            partial_download.replace(voices_download)
        finally:
            if partial_download.exists():
                partial_download.unlink()

    # Prefer downloaded file to embedded
    voices_embedded = _DIR / "voices.json"
    voices_path = voices_download if voices_download.exists() else voices_embedded

    _LOGGER.debug("Loading %s", voices_path)
    with open(voices_path, "r", encoding="utf-8") as voices_file:
        return json.load(voices_file)
def _is_valid_voice_file(data_file_path: Path, file_info: Dict[str, Any]) -> bool:
    """Return True if the file exists with the expected size and MD5 digest."""
    _LOGGER.debug("Checking %s", data_file_path)
    if not data_file_path.exists():
        _LOGGER.debug("Missing %s", data_file_path)
        return False

    # Compare sizes first: it is far cheaper than hashing the whole file.
    expected_size = file_info["size_bytes"]
    actual_size = data_file_path.stat().st_size
    if expected_size != actual_size:
        _LOGGER.warning(
            "Wrong size (expected=%s, actual=%s) for %s",
            expected_size,
            actual_size,
            data_file_path,
        )
        return False

    expected_hash = file_info["md5_digest"]
    actual_hash = get_file_hash(data_file_path)
    if expected_hash != actual_hash:
        _LOGGER.warning(
            "Wrong hash (expected=%s, actual=%s) for %s",
            expected_hash,
            actual_hash,
            data_file_path,
        )
        return False

    return True


def ensure_voice_exists(
    name: str,
    data_dirs: Iterable[Union[str, Path]],
    download_dir: Union[str, Path],
    voices_info: Dict[str, Any],
):
    """Make sure every file of a voice is available, downloading what is missing.

    A file is downloaded only when none of the data directories holds a copy
    with the expected size and MD5 digest; a valid copy in any one directory
    is enough.

    Args:
        name: Voice name, a key of ``voices_info``.
        data_dirs: Directories searched for existing voice files.
        download_dir: Directory that missing files are downloaded into.
        voices_info: Voice metadata as loaded from ``voices.json``.

    Raises:
        ValueError: If ``data_dirs`` is empty or the voice lists no files.
        VoiceNotFoundError: If ``name`` is not in ``voices_info``.
    """
    # Materialise the iterable: it is scanned once per file, and a generator
    # would always look non-empty.
    search_dirs = [Path(data_dir) for data_dir in data_dirs]
    if not search_dirs:
        # An explicit raise (not assert) so the check survives ``python -O``.
        raise ValueError("No data dirs")

    if name not in voices_info:
        raise VoiceNotFoundError(name)

    voice_files = voices_info[name]["files"]
    if not voice_files:
        raise ValueError(f"Unable to find or download voice: {name}")

    files_to_download = []
    for file_path, file_info in voice_files.items():
        file_name = Path(file_path).name
        if file_name in _SKIP_FILES:
            continue

        has_valid_copy = any(
            _is_valid_voice_file(search_dir / file_name, file_info)
            for search_dir in search_dirs
        )
        if not has_valid_copy:
            files_to_download.append(file_path)

    # Download missing files
    download_dir = Path(download_dir)
    for file_path in files_to_download:
        file_url = URL_FORMAT.format(file=file_path)
        download_file_path = download_dir / Path(file_path).name
        download_file_path.parent.mkdir(parents=True, exist_ok=True)

        _LOGGER.debug("Downloading %s to %s", file_url, download_file_path)
        with urlopen(file_url) as response, open(
            download_file_path, "wb"
        ) as download_file:
            shutil.copyfileobj(response, download_file)

        _LOGGER.info("Downloaded %s (%s)", download_file_path, file_url)
def find_voice(name: str, data_dirs: Iterable[Union[str, Path]]) -> Tuple[Path, Path]:
    """Locate the model and config files of a voice.

    Returns the ``(<name>.onnx, <name>.onnx.json)`` paths from the first
    directory in ``data_dirs`` that contains both files.

    Raises:
        ValueError: If no directory contains both files.
    """
    model_name = f"{name}.onnx"
    config_name = f"{name}.onnx.json"

    for candidate_dir in map(Path, data_dirs):
        model_file = candidate_dir / model_name
        config_file = candidate_dir / config_name
        if not model_file.exists():
            continue
        if config_file.exists():
            return model_file, config_file

    raise ValueError(f"Missing files for voice {name}")

piper/file_hash.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import argparse
+import hashlib
+import json
+import sys
+from pathlib import Path
+from typing import Union
def get_file_hash(path: Union[str, Path], bytes_per_chunk: int = 8192) -> str:
    """Return the hexadecimal MD5 digest of a file.

    The file is read ``bytes_per_chunk`` bytes at a time so it never has to
    fit in memory.
    """
    digest = hashlib.md5()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for block_bytes in iter(lambda: stream.read(bytes_per_chunk), b""):
            digest.update(block_bytes)
    return digest.hexdigest()
+# -----------------------------------------------------------------------------
def main():
    """Write a JSON object mapping each given file to its MD5 hash to stdout.

    With ``--dir``, the keys are the file paths relative to that directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("file", nargs="+")
    parser.add_argument("--dir", help="Parent directory")
    args = parser.parse_args()

    parent_dir = Path(args.dir) if args.dir else None

    hashes = {}
    for file_arg in args.file:
        file_path = Path(file_arg)
        # Hash first, then shorten the key, so a missing file is reported
        # before any path mismatch.
        digest = get_file_hash(file_path)
        key = file_path if parent_dir is None else file_path.relative_to(parent_dir)
        hashes[str(key)] = digest

    json.dump(hashes, sys.stdout)


if __name__ == "__main__":
    main()

piper/util.py ADDED Viewed

	@@ -0,0 +1,12 @@

+"""Utilities"""
+import numpy as np
def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
) -> np.ndarray:
    """Normalize audio and convert to int16 range.

    The signal is scaled so its peak absolute value maps to
    ``max_wav_value``; the peak is floored at 0.01 so near-silent audio is
    not amplified without bound.

    Args:
        audio: Floating-point samples.
        max_wav_value: Magnitude the peak is scaled to.

    Returns:
        The samples as an ``int16`` array; empty input yields an empty array.
    """
    if audio.size == 0:
        # np.max raises ValueError on an empty array; there is nothing to scale.
        return audio.astype("int16")

    peak = max(0.01, np.max(np.abs(audio)))
    audio_norm = audio * (max_wav_value / peak)
    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
    return audio_norm.astype("int16")

piper/voice.py ADDED Viewed

	@@ -0,0 +1,177 @@

+import json
+import logging
+import wave
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable, List, Optional, Union
+import numpy as np
+import onnxruntime
+from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run
+from .config import PhonemeType, PiperConfig
+from .const import BOS, EOS, PAD
+from .util import audio_float_to_int16
_LOGGER = logging.getLogger(__name__)


@dataclass
class PiperVoice:
    """A Piper voice: an ONNX Runtime session paired with its configuration."""

    session: onnxruntime.InferenceSession
    config: PiperConfig

    @staticmethod
    def load(
        model_path: Union[str, Path],
        config_path: Optional[Union[str, Path]] = None,
        use_cuda: bool = False,
    ) -> "PiperVoice":
        """Load an ONNX model and config.

        Args:
            model_path: Path to the ONNX model file.
            config_path: Path to the JSON config; defaults to
                ``"<model_path>.json"`` when omitted.
            use_cuda: Use ``CUDAExecutionProvider`` instead of
                ``CPUExecutionProvider``.

        Returns:
            The loaded voice.
        """
        if config_path is None:
            config_path = f"{model_path}.json"

        with open(config_path, "r", encoding="utf-8") as config_file:
            config_dict = json.load(config_file)

        providers = ["CUDAExecutionProvider"] if use_cuda else ["CPUExecutionProvider"]
        return PiperVoice(
            config=PiperConfig.from_dict(config_dict),
            session=onnxruntime.InferenceSession(
                str(model_path),
                sess_options=onnxruntime.SessionOptions(),
                providers=providers,
            ),
        )

    def phonemize(self, text: str) -> List[List[str]]:
        """Text to phonemes grouped by sentence.

        Raises:
            ValueError: If the configured phoneme type is not supported.
        """
        if self.config.phoneme_type == PhonemeType.ESPEAK:
            if self.config.espeak_voice == "ar":
                # Arabic diacritization
                # https://github.com/mush42/libtashkeel/
                text = tashkeel_run(text)

            return phonemize_espeak(text, self.config.espeak_voice)

        if self.config.phoneme_type == PhonemeType.TEXT:
            return phonemize_codepoints(text)

        raise ValueError(f"Unexpected phoneme type: {self.config.phoneme_type}")

    def phonemes_to_ids(self, phonemes: List[str]) -> List[int]:
        """Phonemes to ids.

        The result starts with the BOS ids, then holds the ids of each known
        phoneme followed by the PAD ids, and ends with the EOS ids. Phonemes
        missing from the id map are skipped with a warning.
        """
        id_map = self.config.phoneme_id_map
        ids: List[int] = list(id_map[BOS])

        for phoneme in phonemes:
            if phoneme not in id_map:
                _LOGGER.warning("Missing phoneme from id map: %s", phoneme)
                continue

            ids.extend(id_map[phoneme])
            ids.extend(id_map[PAD])

        ids.extend(id_map[EOS])
        return ids

    def synthesize(
        self,
        text: str,
        wav_file: wave.Wave_write,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ):
        """Synthesize WAV audio from text.

        Sets the WAV parameters (sample rate from the config, 16-bit, mono)
        on ``wav_file`` and writes the audio of each sentence to it. The
        remaining arguments are passed on to ``synthesize_stream_raw``.
        """
        wav_file.setframerate(self.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit
        wav_file.setnchannels(1)  # mono

        for audio_bytes in self.synthesize_stream_raw(
            text,
            speaker_id=speaker_id,
            length_scale=length_scale,
            noise_scale=noise_scale,
            noise_w=noise_w,
            sentence_silence=sentence_silence,
        ):
            wav_file.writeframes(audio_bytes)

    def synthesize_stream_raw(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ) -> Iterable[bytes]:
        """Synthesize raw audio per sentence from text.

        Yields one ``bytes`` chunk of 16-bit mono samples per sentence, each
        followed by ``sentence_silence`` seconds of silence.
        """
        sentence_phonemes = self.phonemize(text)

        # 16-bit mono: two zero bytes per silent sample
        num_silence_samples = int(sentence_silence * self.config.sample_rate)
        silence_bytes = bytes(num_silence_samples * 2)

        for phonemes in sentence_phonemes:
            phoneme_ids = self.phonemes_to_ids(phonemes)
            yield self.synthesize_ids_to_raw(
                phoneme_ids,
                speaker_id=speaker_id,
                length_scale=length_scale,
                noise_scale=noise_scale,
                noise_w=noise_w,
            ) + silence_bytes

    def synthesize_ids_to_raw(
        self,
        phoneme_ids: List[int],
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
    ) -> bytes:
        """Synthesize raw audio from phoneme ids.

        Scales left as ``None`` fall back to the values in the voice config.
        ``speaker_id`` is ignored when the config reports at most one speaker
        and defaults to 0 otherwise.

        Returns:
            The audio as the raw bytes of an int16 sample array.
        """
        if length_scale is None:
            length_scale = self.config.length_scale

        if noise_scale is None:
            noise_scale = self.config.noise_scale

        if noise_w is None:
            noise_w = self.config.noise_w

        # Batch of one sequence
        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
        scales = np.array(
            [noise_scale, length_scale, noise_w],
            dtype=np.float32,
        )

        inputs = {
            "input": phoneme_ids_array,
            "input_lengths": phoneme_ids_lengths,
            "scales": scales,
        }

        if self.config.num_speakers <= 1:
            # NOTE(review): single-speaker models presumably have no "sid"
            # input (as in upstream Piper), so a requested speaker is dropped
            # rather than fed to the session. Confirm against the exported
            # models.
            speaker_id = None
        elif speaker_id is None:
            # Default speaker
            speaker_id = 0

        if speaker_id is not None:
            # Only feed "sid" when there is a value for it.
            inputs["sid"] = np.array([speaker_id], dtype=np.int64)

        # Synthesize through Onnx
        audio = self.session.run(None, inputs)[0].squeeze((0, 1))
        audio = audio_float_to_int16(audio.squeeze())
        return audio.tobytes()

piper/voices.json ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+transformers
+piper-tts
+piper-phonemize~=1.1.0
+onnxruntime>=1.11.0,<2