## VCTK
import torch
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
import time
from scipy.io.wavfile import write


def get_text(text, hps):
    """Convert raw *text* to a 1-D LongTensor of symbol ids.

    Uses the cleaners configured in ``hps.data.text_cleaners``; when
    ``hps.data.add_blank`` is set, a blank token (id 0) is interleaved
    between every symbol. The text and its id sequence are echoed to
    stdout for debugging.
    """
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    print(text, sequence)
    return torch.LongTensor(sequence)


LANG = 'all'
CONFIG_PATH = f"./configs/{LANG}_base.json"
MODEL_PATH = f"./logs/{LANG}_base/G_250000.pth"

# Alternative sample inputs (kept for convenience):
#TEXT = "こんにちは。韓国のロボットを見に日本からここまで来てくれたのに苦労しました。日本語も上手ですか?"
#TEXT = "Ciao. È una giornata davvero calda. piacere di conoscerti"
#CONFIG_PATH = f"./configs/{LANG}_base.json"
#MODEL_PATH = f"./logs/{LANG}_base/G_210000.pth"
#TEXT = "안녕하세요. 저는 서큘러스의 인공지능 파이온 입니다. 앞으로 로봇시대를 이끌어 나가도록 하겠습니다!"
#TEXT= "你好。 我是 Pion,Circulus 的人工智能。让我们引领未来机器人时代!"
TEXT = "I am artificial intelligent voice made by circulus. It is the way."
#TEXT = f"Hola. encantado de conocerlo ¿Qué estás haciendo?"
#TEXT = "नमस्ते। मेरा नाम पिबो है, सर्कुलस का रोबोट। आपसे मिलकर अच्छा लगा"
#SPK_ID = 45
#SPK_ID = 20
#OUTPUT_WAV_PATH = "vits_test"

hps = utils.get_hparams_from_file(CONFIG_PATH)

# Select the posterior encoder: VITS2 checkpoints use an 80-bin mel
# posterior, VITS1 a linear-spectrogram posterior sized from the FFT.
if (
    "use_mel_posterior_encoder" in hps.model.keys()
    and hps.model.use_mel_posterior_encoder
):
    print("Using mel posterior encoder for VITS2")
    posterior_channels = 80  # vits2
    hps.data.use_mel_posterior_encoder = True
else:
    print("Using lin posterior encoder for VITS1")
    posterior_channels = hps.data.filter_length // 2 + 1
    hps.data.use_mel_posterior_encoder = False

net_g = SynthesizerTrn(
    len(symbols),
    posterior_channels,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model,
)
_ = net_g.eval()
_ = utils.load_checkpoint(MODEL_PATH, net_g, None)

# Force the English cleaner (TEXT is English) regardless of what the
# loaded config specifies.
hps.data.text_cleaners = ["canvers_en_cleaners"]
stn_tst = get_text(TEXT, hps)

with torch.no_grad():
    # Loop-invariant inputs: the text tensor and its length do not depend
    # on the speaker, so build them once instead of per iteration.
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    # Synthesize the same utterance with every speaker embedding and
    # write one wav per speaker id.
    for i in range(hps.data.n_speakers):
        start = time.time()
        sid = torch.LongTensor([i])
        audio = (
            net_g.infer(
                x_tst,
                x_tst_lengths,
                sid=sid,
                noise_scale=0.667,
                noise_scale_w=0.8,
                length_scale=1,
            )[0][0, 0]
            .data
            .float()
            .numpy()
        )
        # Report per-speaker synthesis latency in seconds.
        print(i, time.time() - start)
        write(data=audio, rate=hps.data.sampling_rate, filename=f"t_{LANG}_{i}.wav")