Spaces: aigmixer (Sleeping)
aigmixer committed on
Commit 479b96c
1 Parent(s): f0b147b
upload piper scripts
Browse files
- piper/__init__.py +5 -0
- piper/__main__.py +159 -0
- piper/config.py +53 -0
- piper/const.py +5 -0
- piper/download.py +139 -0
- piper/file_hash.py +46 -0
- piper/util.py +12 -0
- piper/voice.py +177 -0
- piper/voices.json +0 -0
piper/__init__.py
ADDED
@@ -0,0 +1,5 @@
from .voice import PiperVoice

__all__ = [
    "PiperVoice",
]

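The package re-export above lets callers import the voice class directly from the top-level package; a minimal sketch:

    from piper import PiperVoice
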
piper/__main__.py
ADDED
@@ -0,0 +1,159 @@
import argparse
import logging
import sys
import time
import wave
from pathlib import Path
from typing import Any, Dict

from . import PiperVoice
from .download import ensure_voice_exists, find_voice, get_voices

_FILE = Path(__file__)
_DIR = _FILE.parent
_LOGGER = logging.getLogger(_FILE.stem)


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", required=True, help="Path to Onnx model file")
    parser.add_argument("-c", "--config", help="Path to model config file")
    parser.add_argument(
        "-f",
        "--output-file",
        "--output_file",
        help="Path to output WAV file (default: stdout)",
    )
    parser.add_argument(
        "-d",
        "--output-dir",
        "--output_dir",
        help="Path to output directory (default: cwd)",
    )
    parser.add_argument(
        "--output-raw",
        "--output_raw",
        action="store_true",
        help="Stream raw audio to stdout",
    )
    #
    parser.add_argument("-s", "--speaker", type=int, help="Id of speaker (default: 0)")
    parser.add_argument(
        "--length-scale", "--length_scale", type=float, help="Phoneme length"
    )
    parser.add_argument(
        "--noise-scale", "--noise_scale", type=float, help="Generator noise"
    )
    parser.add_argument(
        "--noise-w", "--noise_w", type=float, help="Phoneme width noise"
    )
    #
    parser.add_argument("--cuda", action="store_true", help="Use GPU")
    #
    parser.add_argument(
        "--sentence-silence",
        "--sentence_silence",
        type=float,
        default=0.0,
        help="Seconds of silence after each sentence",
    )
    #
    parser.add_argument(
        "--data-dir",
        "--data_dir",
        action="append",
        default=[str(Path.cwd())],
        help="Data directory to check for downloaded models (default: current directory)",
    )
    parser.add_argument(
        "--download-dir",
        "--download_dir",
        help="Directory to download voices into (default: first data dir)",
    )
    #
    parser.add_argument(
        "--update-voices",
        action="store_true",
        help="Download latest voices.json during startup",
    )
    #
    parser.add_argument(
        "--debug", action="store_true", help="Print DEBUG messages to console"
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
    _LOGGER.debug(args)

    if not args.download_dir:
        # Download to first data directory by default
        args.download_dir = args.data_dir[0]

    # Download voice if file doesn't exist
    model_path = Path(args.model)
    if not model_path.exists():
        # Load voice info
        voices_info = get_voices(args.download_dir, update_voices=args.update_voices)

        # Resolve aliases for backwards compatibility with old voice names
        aliases_info: Dict[str, Any] = {}
        for voice_info in voices_info.values():
            for voice_alias in voice_info.get("aliases", []):
                aliases_info[voice_alias] = {"_is_alias": True, **voice_info}

        voices_info.update(aliases_info)
        ensure_voice_exists(args.model, args.data_dir, args.download_dir, voices_info)
        args.model, args.config = find_voice(args.model, args.data_dir)

    # Load voice
    voice = PiperVoice.load(args.model, config_path=args.config, use_cuda=args.cuda)
    synthesize_args = {
        "speaker_id": args.speaker,
        "length_scale": args.length_scale,
        "noise_scale": args.noise_scale,
        "noise_w": args.noise_w,
        "sentence_silence": args.sentence_silence,
    }

    if args.output_raw:
        # Read line-by-line
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue

            # Write raw audio to stdout as it's produced
            audio_stream = voice.synthesize_stream_raw(line, **synthesize_args)
            for audio_bytes in audio_stream:
                sys.stdout.buffer.write(audio_bytes)
                sys.stdout.buffer.flush()
    elif args.output_dir:
        output_dir = Path(args.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Read line-by-line
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue

            wav_path = output_dir / f"{time.monotonic_ns()}.wav"
            with wave.open(str(wav_path), "wb") as wav_file:
                voice.synthesize(line, wav_file, **synthesize_args)

            _LOGGER.info("Wrote %s", wav_path)
    else:
        # Read entire input
        text = sys.stdin.read()

        if (not args.output_file) or (args.output_file == "-"):
            # Write to stdout
            with wave.open(sys.stdout.buffer, "wb") as wav_file:
                voice.synthesize(text, wav_file, **synthesize_args)
        else:
            # Write to file
            with wave.open(args.output_file, "wb") as wav_file:
                voice.synthesize(text, wav_file, **synthesize_args)


if __name__ == "__main__":
    main()

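For reference, a minimal sketch of driving this CLI from Python; the voice name and output path are illustrative, and a matching voice is assumed to be downloadable or already present in --data-dir:

    import subprocess

    # Text is read from stdin; the WAV is written to --output-file.
    subprocess.run(
        ["python", "-m", "piper", "--model", "en_US-lessac-medium", "--output-file", "hello.wav"],
        input="Hello from Piper!",
        text=True,
        check=True,
    )
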
piper/config.py
ADDED
@@ -0,0 +1,53 @@
"""Piper configuration"""
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, Mapping, Sequence


class PhonemeType(str, Enum):
    ESPEAK = "espeak"
    TEXT = "text"


@dataclass
class PiperConfig:
    """Piper configuration"""

    num_symbols: int
    """Number of phonemes"""

    num_speakers: int
    """Number of speakers"""

    sample_rate: int
    """Sample rate of output audio"""

    espeak_voice: str
    """Name of espeak-ng voice or alphabet"""

    length_scale: float
    noise_scale: float
    noise_w: float

    phoneme_id_map: Mapping[str, Sequence[int]]
    """Phoneme -> [id,]"""

    phoneme_type: PhonemeType
    """espeak or text"""

    @staticmethod
    def from_dict(config: Dict[str, Any]) -> "PiperConfig":
        inference = config.get("inference", {})

        return PiperConfig(
            num_symbols=config["num_symbols"],
            num_speakers=config["num_speakers"],
            sample_rate=config["audio"]["sample_rate"],
            noise_scale=inference.get("noise_scale", 0.667),
            length_scale=inference.get("length_scale", 1.0),
            noise_w=inference.get("noise_w", 0.8),
            #
            espeak_voice=config["espeak"]["voice"],
            phoneme_id_map=config["phoneme_id_map"],
            phoneme_type=PhonemeType(config.get("phoneme_type", PhonemeType.ESPEAK)),
        )

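A small sketch of PiperConfig.from_dict with a hand-written config dict; every value below is illustrative rather than taken from a real voice:

    from piper.config import PhonemeType, PiperConfig

    config = PiperConfig.from_dict(
        {
            "num_symbols": 256,
            "num_speakers": 1,
            "audio": {"sample_rate": 22050},
            "espeak": {"voice": "en-us"},
            "phoneme_id_map": {"_": [0], "^": [1], "$": [2]},
            # "inference" and "phoneme_type" are optional; defaults apply here.
        }
    )
    assert config.phoneme_type is PhonemeType.ESPEAK
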
piper/const.py
ADDED
@@ -0,0 +1,5 @@
"""Constants"""

PAD = "_"  # padding (0)
BOS = "^"  # beginning of sentence
EOS = "$"  # end of sentence

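These markers correspond to entries in a voice's phoneme_id_map (see phonemes_to_ids in voice.py below); the ids here are only illustrative:

    from piper.const import BOS, EOS, PAD

    # A typical id map reserves dedicated ids for the special markers.
    phoneme_id_map = {PAD: [0], BOS: [1], EOS: [2]}
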
piper/download.py
ADDED
@@ -0,0 +1,139 @@
"""Utility for downloading Piper voices."""
import json
import logging
import shutil
from pathlib import Path
from typing import Any, Dict, Iterable, Set, Tuple, Union
from urllib.request import urlopen

from .file_hash import get_file_hash

URL_FORMAT = "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/{file}"

_DIR = Path(__file__).parent
_LOGGER = logging.getLogger(__name__)

_SKIP_FILES = {"MODEL_CARD"}


class VoiceNotFoundError(Exception):
    pass


def get_voices(
    download_dir: Union[str, Path], update_voices: bool = False
) -> Dict[str, Any]:
    """Loads available voices from downloaded or embedded JSON file."""
    download_dir = Path(download_dir)
    voices_download = download_dir / "voices.json"

    if update_voices:
        # Download latest voices.json
        voices_url = URL_FORMAT.format(file="voices.json")
        _LOGGER.debug("Downloading %s to %s", voices_url, voices_download)
        with urlopen(voices_url) as response, open(
            voices_download, "wb"
        ) as download_file:
            shutil.copyfileobj(response, download_file)

    # Prefer downloaded file to embedded
    voices_embedded = _DIR / "voices.json"
    voices_path = voices_download if voices_download.exists() else voices_embedded

    _LOGGER.debug("Loading %s", voices_path)
    with open(voices_path, "r", encoding="utf-8") as voices_file:
        return json.load(voices_file)


def ensure_voice_exists(
    name: str,
    data_dirs: Iterable[Union[str, Path]],
    download_dir: Union[str, Path],
    voices_info: Dict[str, Any],
):
    assert data_dirs, "No data dirs"
    if name not in voices_info:
        raise VoiceNotFoundError(name)

    voice_info = voices_info[name]
    voice_files = voice_info["files"]
    files_to_download: Set[str] = set()

    for data_dir in data_dirs:
        data_dir = Path(data_dir)

        # Check sizes/hashes
        for file_path, file_info in voice_files.items():
            if file_path in files_to_download:
                # Already planning to download
                continue

            file_name = Path(file_path).name
            if file_name in _SKIP_FILES:
                continue

            data_file_path = data_dir / file_name
            _LOGGER.debug("Checking %s", data_file_path)
            if not data_file_path.exists():
                _LOGGER.debug("Missing %s", data_file_path)
                files_to_download.add(file_path)
                continue

            expected_size = file_info["size_bytes"]
            actual_size = data_file_path.stat().st_size
            if expected_size != actual_size:
                _LOGGER.warning(
                    "Wrong size (expected=%s, actual=%s) for %s",
                    expected_size,
                    actual_size,
                    data_file_path,
                )
                files_to_download.add(file_path)
                continue

            expected_hash = file_info["md5_digest"]
            actual_hash = get_file_hash(data_file_path)
            if expected_hash != actual_hash:
                _LOGGER.warning(
                    "Wrong hash (expected=%s, actual=%s) for %s",
                    expected_hash,
                    actual_hash,
                    data_file_path,
                )
                files_to_download.add(file_path)
                continue

    if (not voice_files) and (not files_to_download):
        raise ValueError(f"Unable to find or download voice: {name}")

    # Download missing files
    download_dir = Path(download_dir)

    for file_path in files_to_download:
        file_name = Path(file_path).name
        if file_name in _SKIP_FILES:
            continue

        file_url = URL_FORMAT.format(file=file_path)
        download_file_path = download_dir / file_name
        download_file_path.parent.mkdir(parents=True, exist_ok=True)

        _LOGGER.debug("Downloading %s to %s", file_url, download_file_path)
        with urlopen(file_url) as response, open(
            download_file_path, "wb"
        ) as download_file:
            shutil.copyfileobj(response, download_file)

        _LOGGER.info("Downloaded %s (%s)", download_file_path, file_url)


def find_voice(name: str, data_dirs: Iterable[Union[str, Path]]) -> Tuple[Path, Path]:
    for data_dir in data_dirs:
        data_dir = Path(data_dir)
        onnx_path = data_dir / f"{name}.onnx"
        config_path = data_dir / f"{name}.onnx.json"

        if onnx_path.exists() and config_path.exists():
            return onnx_path, config_path

    raise ValueError(f"Missing files for voice {name}")

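A sketch of how these helpers fit together; the voice name is an example and network access to huggingface.co is assumed for any missing files:

    from pathlib import Path

    from piper.download import ensure_voice_exists, find_voice, get_voices

    data_dir = Path.cwd()
    voices_info = get_voices(data_dir)  # falls back to the bundled voices.json
    ensure_voice_exists("en_US-lessac-medium", [data_dir], data_dir, voices_info)
    onnx_path, config_path = find_voice("en_US-lessac-medium", [data_dir])
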
piper/file_hash.py
ADDED
@@ -0,0 +1,46 @@
import argparse
import hashlib
import json
import sys
from pathlib import Path
from typing import Union


def get_file_hash(path: Union[str, Path], bytes_per_chunk: int = 8192) -> str:
    """Hash a file in chunks using md5."""
    path_hash = hashlib.md5()
    with open(path, "rb") as path_file:
        chunk = path_file.read(bytes_per_chunk)
        while chunk:
            path_hash.update(chunk)
            chunk = path_file.read(bytes_per_chunk)

    return path_hash.hexdigest()


# -----------------------------------------------------------------------------


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("file", nargs="+")
    parser.add_argument("--dir", help="Parent directory")
    args = parser.parse_args()

    if args.dir:
        args.dir = Path(args.dir)

    hashes = {}
    for path_str in args.file:
        path = Path(path_str)
        path_hash = get_file_hash(path)
        if args.dir:
            path = path.relative_to(args.dir)

        hashes[str(path)] = path_hash

    json.dump(hashes, sys.stdout)


if __name__ == "__main__":
    main()

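get_file_hash can also be called directly; the path below is illustrative:

    from piper.file_hash import get_file_hash

    digest = get_file_hash("en_US-lessac-medium.onnx")  # hex MD5 digest string
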
piper/util.py
ADDED
@@ -0,0 +1,12 @@
"""Utilities"""
import numpy as np


def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
) -> np.ndarray:
    """Normalize audio and convert to int16 range"""
    audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio))))
    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
    audio_norm = audio_norm.astype("int16")
    return audio_norm

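A quick check of the float-to-int16 conversion above, using a synthetic signal:

    import numpy as np

    from piper.util import audio_float_to_int16

    audio = 0.5 * np.sin(np.linspace(0.0, 2.0 * np.pi, 100, dtype=np.float32))
    pcm = audio_float_to_int16(audio)
    assert pcm.dtype == np.int16
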
piper/voice.py
ADDED
@@ -0,0 +1,177 @@
import json
import logging
import wave
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Optional, Union

import numpy as np
import onnxruntime
from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run

from .config import PhonemeType, PiperConfig
from .const import BOS, EOS, PAD
from .util import audio_float_to_int16

_LOGGER = logging.getLogger(__name__)


@dataclass
class PiperVoice:
    session: onnxruntime.InferenceSession
    config: PiperConfig

    @staticmethod
    def load(
        model_path: Union[str, Path],
        config_path: Optional[Union[str, Path]] = None,
        use_cuda: bool = False,
    ) -> "PiperVoice":
        """Load an ONNX model and config."""
        if config_path is None:
            config_path = f"{model_path}.json"

        with open(config_path, "r", encoding="utf-8") as config_file:
            config_dict = json.load(config_file)

        return PiperVoice(
            config=PiperConfig.from_dict(config_dict),
            session=onnxruntime.InferenceSession(
                str(model_path),
                sess_options=onnxruntime.SessionOptions(),
                providers=["CPUExecutionProvider"]
                if not use_cuda
                else ["CUDAExecutionProvider"],
            ),
        )

    def phonemize(self, text: str) -> List[List[str]]:
        """Text to phonemes grouped by sentence."""
        if self.config.phoneme_type == PhonemeType.ESPEAK:
            if self.config.espeak_voice == "ar":
                # Arabic diacritization
                # https://github.com/mush42/libtashkeel/
                text = tashkeel_run(text)

            return phonemize_espeak(text, self.config.espeak_voice)

        if self.config.phoneme_type == PhonemeType.TEXT:
            return phonemize_codepoints(text)

        raise ValueError(f"Unexpected phoneme type: {self.config.phoneme_type}")

    def phonemes_to_ids(self, phonemes: List[str]) -> List[int]:
        """Phonemes to ids."""
        id_map = self.config.phoneme_id_map
        ids: List[int] = list(id_map[BOS])

        for phoneme in phonemes:
            if phoneme not in id_map:
                _LOGGER.warning("Missing phoneme from id map: %s", phoneme)
                continue

            ids.extend(id_map[phoneme])
            ids.extend(id_map[PAD])

        ids.extend(id_map[EOS])

        return ids

    def synthesize(
        self,
        text: str,
        wav_file: wave.Wave_write,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ):
        """Synthesize WAV audio from text."""
        wav_file.setframerate(self.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit
        wav_file.setnchannels(1)  # mono

        for audio_bytes in self.synthesize_stream_raw(
            text,
            speaker_id=speaker_id,
            length_scale=length_scale,
            noise_scale=noise_scale,
            noise_w=noise_w,
            sentence_silence=sentence_silence,
        ):
            wav_file.writeframes(audio_bytes)

    def synthesize_stream_raw(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
        sentence_silence: float = 0.0,
    ) -> Iterable[bytes]:
        """Synthesize raw audio per sentence from text."""
        sentence_phonemes = self.phonemize(text)

        # 16-bit mono
        num_silence_samples = int(sentence_silence * self.config.sample_rate)
        silence_bytes = bytes(num_silence_samples * 2)

        for phonemes in sentence_phonemes:
            phoneme_ids = self.phonemes_to_ids(phonemes)
            yield self.synthesize_ids_to_raw(
                phoneme_ids,
                speaker_id=speaker_id,
                length_scale=length_scale,
                noise_scale=noise_scale,
                noise_w=noise_w,
            ) + silence_bytes

    def synthesize_ids_to_raw(
        self,
        phoneme_ids: List[int],
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
    ) -> bytes:
        """Synthesize raw audio from phoneme ids."""
        if length_scale is None:
            length_scale = self.config.length_scale

        if noise_scale is None:
            noise_scale = self.config.noise_scale

        if noise_w is None:
            noise_w = self.config.noise_w

        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
        scales = np.array(
            [noise_scale, length_scale, noise_w],
            dtype=np.float32,
        )

        if (self.config.num_speakers > 1) and (speaker_id is None):
            # Default speaker
            speaker_id = 0

        sid = None

        if speaker_id is not None:
            sid = np.array([speaker_id], dtype=np.int64)

        # Synthesize through Onnx
        audio = self.session.run(
            None,
            {
                "input": phoneme_ids_array,
                "input_lengths": phoneme_ids_lengths,
                "scales": scales,
                "sid": sid,
            },
        )[0].squeeze((0, 1))
        audio = audio_float_to_int16(audio.squeeze())

        return audio.tobytes()

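A minimal end-to-end sketch of the PiperVoice API above; the model path is an example and the matching .onnx.json config is assumed to sit next to it:

    import wave

    from piper import PiperVoice

    voice = PiperVoice.load("en_US-lessac-medium.onnx")  # config defaults to <model>.json
    with wave.open("hello.wav", "wb") as wav_file:
        voice.synthesize("Hello from Piper!", wav_file)
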
piper/voices.json
ADDED
The diff for this file is too large to render.
See raw diff