VietnameseVITS / app.py
chnk58hoang's picture
add time logging
21d1ad5
raw
history blame contribute delete
802 Bytes
import gradio as gr
import numpy as np
from utils import load_model, normalize_text
import time
# Load the model once at import time; text_to_speech() below reuses this
# single instance on every call instead of reloading it.
vits = load_model()
def text_to_speech(text):
    """Synthesize speech for ``text`` and log inference timing.

    The text is normalized, converted to token ids, and passed to the
    module-level ``vits`` model's ONNX inference. The wall-clock inference
    time and the real-time factor (inference time divided by the duration of
    the generated audio) are printed to stdout.

    Args:
        text: The raw input string to synthesize.

    Returns:
        A ``(sample_rate, waveform)`` tuple, where ``waveform`` is the first
        item of the model output (``audio[0]``) and ``sample_rate`` is read
        from ``vits.config.audio.sample_rate``.
    """
    text = normalize_text(text)
    # [None, :] adds a leading axis of size 1 in front of the id sequence.
    text_inputs = np.asarray(
        vits.tokenizer.text_to_ids(text),
        dtype=np.int64,
    )[None, :]

    # Time only the model call, not normalization or tokenization.
    start = time.perf_counter()
    audio = vits.inference_onnx(text_inputs)
    inference_time = time.perf_counter() - start

    # Use the configured rate for both the duration computed here and the
    # value handed back to the caller, so the logged real-time factor and
    # the playback rate cannot drift apart (previously 16000 was hard-coded
    # in the return value only).
    sample_rate = vits.config.audio.sample_rate
    audio_length = audio.shape[1] / sample_rate

    print(f'Inference time: {inference_time}')
    if audio_length > 0:
        print(f'Real time factor: {inference_time / audio_length}')
    else:
        # Zero-length output: the ratio is undefined, so skip it instead of
        # failing the whole request on a division by zero.
        print('Real time factor: n/a (empty audio)')
    return sample_rate, audio[0]
# Build the web UI: a text box feeding text_to_speech, with the result
# rendered by an audio output component.
interface = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs="audio",
    theme="default",
)

# Start serving the app with debug mode off.
interface.launch(debug=False)