# NOTE(review): removed file-viewer artifacts (byte size line, commit hashes,
# column ruler) that were accidentally pasted into the script — they are not
# Python and would raise a SyntaxError at import time.
## VCTK
import torch
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
import time
from scipy.io.wavfile import write
def get_text(text, hps):
    """Normalize *text* into a LongTensor of symbol ids.

    Applies the cleaners configured in ``hps.data.text_cleaners`` and, when
    ``hps.data.add_blank`` is set, interleaves a blank (0) token between
    every symbol — the spacing the model was trained with.
    """
    seq = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        seq = commons.intersperse(seq, 0)
    # Debug trace: original text alongside its normalized id sequence.
    print(text, seq)
    return torch.LongTensor(seq)
# --- Inference configuration -------------------------------------------------
# LANG selects both the config file and the checkpoint directory below.
LANG = 'all'
CONFIG_PATH = f"./configs/{LANG}_base.json"
MODEL_PATH = f"./logs/{LANG}_base/G_250000.pth"
# Alternative test prompts in other languages (kept for quick switching):
#TEXT = "こんにちは。韓国のロボットを見に日本からここまで来てくれたのに苦労しました。日本語も上手ですか?"
#TEXT = "Ciao. È una giornata davvero calda. piacere di conoscerti"
#CONFIG_PATH = f"./configs/{LANG}_base.json"
#MODEL_PATH = f"./logs/{LANG}_base/G_210000.pth"
#TEXT = "안녕하세요. 저는 서큘러스의 인공지능 파이온 입니다. 앞으로 로봇시대를 이끌어 나가도록 하겠습니다!"
#TEXT= "你好。 我是 Pion,Circulus 的人工智能。让我们引领未来机器人时代!"
# Active prompt synthesized for every speaker in the loop below.
TEXT = "I am artificial intelligent voice made by circulus. It is the way."
#TEXT = f"Hola. encantado de conocerlo ¿Qué estás haciendo?"
#TEXT = "नमस्ते। मेरा नाम पिबो है, सर्कुलस का रोबोट। आपसे मिलकर अच्छा लगा"
#SPK_ID = 45
#SPK_ID = 20
#OUTPUT_WAV_PATH = "vits_test"
# Load hyperparameters and pick the posterior-encoder configuration.
hps = utils.get_hparams_from_file(CONFIG_PATH)
# VITS2 configs carry a "use_mel_posterior_encoder" flag; when present and
# truthy, the posterior encoder consumes 80-bin mel spectrograms instead of
# the linear-spectrogram channels used by VITS1.
# (Idiom fix: dropped the explicit `== True` comparison — PEP 8 discourages
# equality comparison against True; truthiness is equivalent here.)
if (
    "use_mel_posterior_encoder" in hps.model.keys()
    and hps.model.use_mel_posterior_encoder
):
    print("Using mel posterior encoder for VITS2")
    posterior_channels = 80  # fixed mel-bin count for VITS2
    hps.data.use_mel_posterior_encoder = True
else:
    print("Using lin posterior encoder for VITS1")
    # Linear-spectrogram channel count: n_fft // 2 + 1.
    posterior_channels = hps.data.filter_length // 2 + 1
    hps.data.use_mel_posterior_encoder = False
# Build the synthesizer and restore weights from the checkpoint.
net_g = SynthesizerTrn(
    len(symbols),
    posterior_channels,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model
)
# Inference only: disable dropout / training-mode layers.
_ = net_g.eval()
# Load generator weights; optimizer state is not needed (None).
_ = utils.load_checkpoint(MODEL_PATH, net_g, None)
# Override the config's cleaners before encoding the prompt.
# NOTE(review): this forces English cleaning regardless of CONFIG_PATH —
# presumably intentional since TEXT above is English; confirm for other LANGs.
hps.data.text_cleaners = ["canvers_en_cleaners"]
stn_tst = get_text(TEXT, hps)
# Synthesize the prompt once per speaker id and write each result to a WAV.
with torch.no_grad():
    # The input sequence is identical for every speaker — build the batched
    # tensor and its length once, outside the loop (hoisted loop invariant).
    x_tst = stn_tst.unsqueeze(0)  # add batch dimension: (1, T)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    for i in range(hps.data.n_speakers):
        start = time.time()
        sid = torch.LongTensor([i])  # speaker id for this pass
        audio = (
            net_g.infer(
                x_tst,
                x_tst_lengths,
                sid=sid,
                noise_scale=0.667,
                noise_scale_w=0.8,
                length_scale=1,
            )[0][0, 0]  # first output, first batch item, first channel
            .data
            .float()
            .numpy()
        )
        # Per-speaker synthesis wall time in seconds.
        print(i, time.time() - start)
        write(data=audio, rate=hps.data.sampling_rate, filename=f"t_{LANG}_{i}.wav")
# NOTE(review): removed a stray "|" viewer artifact at end of file.