File size: 6,652 Bytes
ad16788 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 |
from pathlib import Path
from typing import Iterable
from typing import List
from typing import Optional
from typing import Union
import g2p_en
from typeguard import check_argument_types
from espnet2.text.abs_tokenizer import AbsTokenizer
def split_by_space(text) -> List[str]:
    """Cut ``text`` at every single space character.

    Runs of consecutive spaces are not merged, so they yield empty-string
    tokens, and an empty input yields ``[""]``.
    """
    separator = " "
    pieces = text.split(separator)
    return pieces
def pyopenjtalk_g2p(text) -> List[str]:
    """Run ``pyopenjtalk.g2p`` on ``text`` and return the phones as a list."""
    # Imported on demand so the module loads without pyopenjtalk installed.
    import pyopenjtalk

    # With kana=False the result is one string of space-separated phones.
    return pyopenjtalk.g2p(text, kana=False).split(" ")
def pyopenjtalk_g2p_accent(text) -> List[str]:
    """Return phones interleaved with two values parsed from each label.

    Every label in the second element of ``pyopenjtalk.run_frontend(text)``
    is scanned for three captures: the phone between ``-`` and ``+``, the
    value following ``/A:``, and the single digit following ``_`` inside the
    ``/F:`` field. Labels that do not produce exactly one match are skipped.
    For each remaining label the output receives, in this order: the phone,
    the ``/F:`` digit, then the ``/A:`` value.
    """
    import pyopenjtalk
    import re

    label_pattern = re.compile(
        r"\-(.*?)\+.*?\/A:([0-9\-]+).*?\/F:.*?_([0-9])"
    )
    result = []
    for label in pyopenjtalk.run_frontend(text)[1]:
        matches = label_pattern.findall(label)
        if len(matches) != 1:
            continue
        phone, a_value, f_value = matches[0]
        result.extend((phone, f_value, a_value))
    return result
def pyopenjtalk_g2p_accent_with_pause(text) -> List[str]:
    """Like the accent variant, but also emits ``"pau"`` for pause labels.

    For every label in the second element of
    ``pyopenjtalk.run_frontend(text)``:

    * if the text between the first ``-`` and the following ``+`` is
      ``"pau"``, a single ``"pau"`` token is emitted;
    * otherwise, when the label matches the pattern exactly once, the phone,
      the digit after ``_`` in the ``/F:`` field and the value after ``/A:``
      are emitted in that order;
    * any other label contributes nothing.
    """
    import pyopenjtalk
    import re

    label_pattern = re.compile(
        r"\-(.*?)\+.*?\/A:([0-9\-]+).*?\/F:.*?_([0-9])"
    )
    result = []
    for label in pyopenjtalk.run_frontend(text)[1]:
        current_phone = label.split("-")[1].split("+")[0]
        if current_phone == "pau":
            result.append("pau")
            continue
        matches = label_pattern.findall(label)
        if len(matches) == 1:
            phone, a_value, f_value = matches[0]
            result.extend((phone, f_value, a_value))
    return result
def pyopenjtalk_g2p_kana(text) -> List[str]:
    """Run ``pyopenjtalk.g2p`` with ``kana=True`` and split it into items.

    The value returned by pyopenjtalk is iterated element by element, so a
    string result becomes a list of its individual characters.
    """
    import pyopenjtalk

    kana_text = pyopenjtalk.g2p(text, kana=True)
    return [*kana_text]
def pypinyin_g2p(text) -> List[str]:
    """Convert ``text`` with ``pypinyin.pinyin`` using ``Style.TONE3``.

    Only the first element of every entry returned by ``pinyin`` is kept.
    """
    from pypinyin import pinyin
    from pypinyin import Style

    result = []
    for entry in pinyin(text, style=Style.TONE3):
        result.append(entry[0])
    return result
def pypinyin_g2p_phone(text) -> List[str]:
    """Convert ``text`` to pinyin and split each entry into initial and final.

    ``pypinyin.pinyin`` is called with ``Style.TONE3``; the first element of
    each entry is passed to ``get_initials`` and ``get_finals`` (both with
    ``strict=True``). The initial comes before the final in the output, and
    parts of length zero are omitted.
    """
    from pypinyin import pinyin
    from pypinyin import Style
    from pypinyin.style._utils import get_finals
    from pypinyin.style._utils import get_initials

    result = []
    for entry in pinyin(text, style=Style.TONE3):
        syllable = entry[0]
        initial = get_initials(syllable, strict=True)
        final = get_finals(syllable, strict=True)
        for part in (initial, final):
            if len(part) != 0:
                result.append(part)
    return result
class G2p_en:
    """Lazily-constructed stand-in for g2p_en.G2p.

    g2p_en.G2p isn't picklable, so it can't be copied to other processes
    via the multiprocessing module.
    As a workaround, g2p_en.G2p is only instantiated the first time this
    object is called.
    """

    def __init__(self, no_space: bool = False):
        # The real converter is created on first use; see the class docstring.
        self.g2p = None
        self.no_space = no_space

    def __call__(self, text) -> List[str]:
        """Convert ``text`` and, if ``no_space`` is set, drop the spaces."""
        if self.g2p is None:
            self.g2p = g2p_en.G2p()
        phones = self.g2p(text)
        if not self.no_space:
            return phones
        # A single-space item represents a word separator; remove those.
        return [phone for phone in phones if phone != " "]
class Phonemizer:
    """Phonemizer module for various languages.

    This is a wrapper around https://github.com/bootphon/phonemizer.
    Different g2p modules can be defined by passing options through to
    phonemizer. See the available options:
        https://github.com/bootphon/phonemizer/blob/master/phonemizer/phonemize.py#L32
    """

    def __init__(
        self,
        word_separator: Optional[str] = None,
        syllable_separator: Optional[str] = None,
        **phonemize_kwargs,
    ):
        # Delayed import: phonemizer is only needed once an instance is built.
        from phonemizer import phonemize
        from phonemizer.separator import Separator

        self.phonemize_kwargs = phonemize_kwargs
        self.phonemize = phonemize
        # Phones are always delimited by a space so __call__ can split them.
        self.separator = Separator(
            phone=" ",
            word=word_separator,
            syllable=syllable_separator,
        )

    def __call__(self, text) -> List[str]:
        """Phonemize ``text`` and split the result on whitespace."""
        phonemized = self.phonemize(
            text,
            separator=self.separator,
            **self.phonemize_kwargs,
        )
        return phonemized.split()
class PhonemeTokenizer(AbsTokenizer):
    """Tokenizer that turns a line of text into tokens via a g2p callable.

    Args:
        g2p_type: Which converter to use. ``None`` splits the text on single
            spaces. Supported names are ``"g2p_en"``, ``"g2p_en_no_space"``,
            ``"pyopenjtalk"``, ``"pyopenjtalk_kana"``,
            ``"pyopenjtalk_accent"``, ``"pyopenjtalk_accent_with_pause"``,
            ``"pypinyin_g2p"``, ``"pypinyin_g2p_phone"`` and
            ``"espeak_ng_arabic"``.
        non_linguistic_symbols: Symbols that may be stripped from the text
            before conversion. Either a path (``Path`` or ``str``) to a UTF-8
            file with one symbol per line, or an iterable of symbols. Empty
            symbols are discarded because an empty prefix matches everywhere.
        space_symbol: Stored on the instance and shown by ``repr``; the
            methods of this class do not otherwise use it.
        remove_non_linguistic_symbols: When true, every occurrence of a
            non-linguistic symbol is removed before the text is converted.

    Raises:
        NotImplementedError: If ``g2p_type`` is not one of the values above.
    """

    def __init__(
        self,
        g2p_type: Union[None, str],
        non_linguistic_symbols: Optional[Union[Path, str, Iterable[str]]] = None,
        space_symbol: str = "<space>",
        remove_non_linguistic_symbols: bool = False,
    ):
        assert check_argument_types()
        if g2p_type is None:
            self.g2p = split_by_space
        elif g2p_type == "g2p_en":
            self.g2p = G2p_en(no_space=False)
        elif g2p_type == "g2p_en_no_space":
            self.g2p = G2p_en(no_space=True)
        elif g2p_type == "pyopenjtalk":
            self.g2p = pyopenjtalk_g2p
        elif g2p_type == "pyopenjtalk_kana":
            self.g2p = pyopenjtalk_g2p_kana
        elif g2p_type == "pyopenjtalk_accent":
            self.g2p = pyopenjtalk_g2p_accent
        elif g2p_type == "pyopenjtalk_accent_with_pause":
            self.g2p = pyopenjtalk_g2p_accent_with_pause
        elif g2p_type == "pypinyin_g2p":
            self.g2p = pypinyin_g2p
        elif g2p_type == "pypinyin_g2p_phone":
            self.g2p = pypinyin_g2p_phone
        elif g2p_type == "espeak_ng_arabic":
            self.g2p = Phonemizer(language="ar", backend="espeak", with_stress=True)
        else:
            raise NotImplementedError(f"Not supported: g2p_type={g2p_type}")

        self.g2p_type = g2p_type
        self.space_symbol = space_symbol
        if non_linguistic_symbols is None:
            self.non_linguistic_symbols = set()
        elif isinstance(non_linguistic_symbols, (Path, str)):
            # A str is always interpreted as a file path, never as a symbol.
            non_linguistic_symbols = Path(non_linguistic_symbols)
            with non_linguistic_symbols.open("r", encoding="utf-8") as f:
                stripped = (raw.rstrip() for raw in f)
                # Blank lines would yield "", which matches at every position
                # and used to make text2tokens() loop forever.
                self.non_linguistic_symbols = {sym for sym in stripped if sym}
        else:
            self.non_linguistic_symbols = {
                sym for sym in non_linguistic_symbols if sym
            }
        self.remove_non_linguistic_symbols = remove_non_linguistic_symbols

    def __repr__(self):
        return (
            f"{self.__class__.__name__}("
            f'g2p_type="{self.g2p_type}", '
            f'space_symbol="{self.space_symbol}", '
            f'non_linguistic_symbols="{self.non_linguistic_symbols}"'
            f")"
        )

    def text2tokens(self, line: str) -> List[str]:
        """Convert ``line`` into tokens with the configured g2p callable.

        When ``remove_non_linguistic_symbols`` is true, all occurrences of
        the non-linguistic symbols are removed first; the text is scanned
        left to right and the longest symbol matching at each position wins.
        """
        if not self.remove_non_linguistic_symbols:
            # Keeping the symbols means the scan below would only reassemble
            # ``line`` unchanged, so it can be skipped entirely.
            return self.g2p(line)

        # Longest first (ties broken alphabetically) so the outcome does not
        # depend on set iteration order when one symbol is a prefix of
        # another. Empty symbols are skipped: they match everywhere without
        # consuming input.
        symbols = sorted(
            (sym for sym in self.non_linguistic_symbols if sym),
            key=lambda sym: (-len(sym), sym),
        )
        kept = []
        pos = 0
        end = len(line)
        while pos < end:
            for sym in symbols:
                if line.startswith(sym, pos):
                    pos += len(sym)
                    break
            else:
                kept.append(line[pos])
                pos += 1
        return self.g2p("".join(kept))

    def tokens2text(self, tokens: Iterable[str]) -> str:
        """Concatenate ``tokens`` without any separator."""
        # phoneme type is not invertible
        return "".join(tokens)
|