"""Gradio demo: synthesize speech from text with the model from ``utils.load_model``."""

import time

import gradio as gr
import numpy as np

from utils import load_model, normalize_text

# Loaded once at import time so every request reuses the same model object.
vits = load_model()


def text_to_speech(text: str):
    """Synthesize speech for *text* and return it in Gradio's audio format.

    The text is passed through ``normalize_text``, converted to token ids
    with ``vits.tokenizer.text_to_ids`` and fed to ``vits.inference_onnx``
    as an int64 array with a leading axis of length 1. The wall-clock
    inference time and the real-time factor (inference time divided by the
    duration of the produced audio) are printed to stdout.

    Args:
        text: Input text to synthesize.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is
        ``vits.config.audio.sample_rate`` and ``samples`` is the first
        entry of the model output.
    """
    text = normalize_text(text)
    token_ids = vits.tokenizer.text_to_ids(text)
    # [None, :] prepends an axis of length 1 in front of the token sequence.
    text_inputs = np.asarray(token_ids, dtype=np.int64)[None, :]

    start = time.perf_counter()
    audio = vits.inference_onnx(text_inputs)
    inference_time = time.perf_counter() - start

    # Use the model's configured rate both for the duration estimate and for
    # the value handed to Gradio; a hard-coded rate would play the audio at
    # the wrong speed whenever the model is not configured for it.
    sample_rate = vits.config.audio.sample_rate
    # Axis 1 of the output is treated as the number of samples.
    audio_length = audio.shape[1] / sample_rate
    # Avoid ZeroDivisionError when the model returns no samples.
    real_time_factor = (
        inference_time / audio_length if audio_length > 0 else float("inf")
    )

    print(f"Inference time: {inference_time}")
    print(f"Real time factor: {real_time_factor}")

    return sample_rate, audio[0]


demo = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs="audio",
    theme="default",
)
demo.launch(debug=False)