"""Gradio demo: transcribe English text to IPA and synthesize it to speech."""

from logging import getLogger
from typing import Tuple

import gradio as gr
import numpy as np
import numpy.typing as npt

from en_tts import Synthesizer, Transcriber

# Nominal full-scale range of floating point wav data.
FLOAT32_64_MIN_WAV = -1.0
FLOAT32_64_MAX_WAV = 1.0

# Full-scale ranges of the supported integer PCM formats.
INT16_MIN = np.iinfo(np.int16).min  # -32768 = -(2**15)
INT16_MAX = np.iinfo(np.int16).max  # 32767 = 2**15 - 1
INT32_MIN = np.iinfo(np.int32).min  # -2147483648 = -(2**31)
INT32_MAX = np.iinfo(np.int32).max  # 2147483647 = 2**31 - 1

# Sample rate reported to Gradio together with the audio samples.
# NOTE(review): presumably the synthesizer's output rate -- confirm against en_tts.
SAMPLE_RATE_HZ = 22050

logger = getLogger(__name__)

logger.info("Initializing transcriber...")
transcriber = Transcriber()

logger.info("Initializing synthesizer...")
synthesizer = Synthesizer()


def synt(text: str) -> Tuple[int, npt.NDArray[np.int16]]:
    """Turn *text* into speech.

    The text is transcribed to IPA, the IPA is synthesized, and the
    resulting waveform is converted to 16-bit PCM.

    Args:
        text: The text to speak.

    Returns:
        A ``(sample_rate, samples)`` tuple, the format consumed by the
        ``gr.Audio(type="numpy")`` output component.
    """
    logger.info("Transcribing...")
    text_ipa = transcriber.transcribe_to_ipa(text)

    logger.info("Synthesizing...")
    audio = synthesizer.synthesize(text_ipa)

    audio_int = convert_wav(audio, np.int16)
    return SAMPLE_RATE_HZ, audio_int


def get_max_value(dtype):
    """Return the largest full-scale sample value for *dtype*.

    Supported dtypes are int16, int32, float32 and float64.

    Raises:
        AssertionError: If *dtype* is not supported.
    """
    # see wavfile.write() max positive eg. on 16-bit PCM is 32767
    if dtype == np.int16:
        return INT16_MAX
    if dtype == np.int32:
        return INT32_MAX
    if dtype in (np.float32, np.float64):
        return FLOAT32_64_MAX_WAV
    # Raised explicitly (instead of `assert False`) so the check survives
    # `python -O`, which strips assert statements.
    raise AssertionError(f"Unsupported wav dtype: {dtype!r}")


def get_min_value(dtype):
    """Return the smallest full-scale sample value for *dtype*.

    Supported dtypes are int16, int32, float32 and float64.

    Raises:
        AssertionError: If *dtype* is not supported.
    """
    if dtype == np.int16:
        return INT16_MIN
    if dtype == np.int32:
        return INT32_MIN
    if dtype in (np.float32, np.float64):
        return FLOAT32_64_MIN_WAV
    # Raised explicitly (instead of `assert False`) so the check survives
    # `python -O`, which strips assert statements.
    raise AssertionError(f"Unsupported wav dtype: {dtype!r}")


def convert_wav(wav: npt.NDArray[np.float64], to_dtype) -> npt.NDArray:
    """Rescale *wav* from its own dtype's full-scale range to *to_dtype*.

    If the wav is over-amplified, a floating point result will also be
    over-amplified. For integer targets the samples are clipped to the
    representable range instead, because casting out-of-range floats to an
    integer dtype is undefined in NumPy and typically wraps around.

    Args:
        wav: The samples to convert.
        to_dtype: Target dtype (int16, int32, float32 or float64).

    Returns:
        The samples as an array of *to_dtype*.

    Raises:
        AssertionError: If the source or target dtype is not supported.
    """
    if wav.dtype != to_dtype:
        # Normalize by the magnitude of the source minimum, then scale to
        # the positive maximum of the target so the result is symmetric.
        wav = wav / (-1 * get_min_value(wav.dtype)) * get_max_value(to_dtype)

    if to_dtype in (np.int16, np.int32):
        # the default seems to be np.fix instead of np.round on wav.astype()
        wav = np.round(wav, 0)
        # Keep over-amplified samples inside the integer range before casting.
        wav = np.clip(wav, get_min_value(to_dtype), get_max_value(to_dtype))

    wav = wav.astype(to_dtype)
    return wav


example_text = (
    "When the sunlight strikes raindrops in the air, "
    "they act as a prism and form a rainbow."
)

iface = gr.Interface(
    fn=synt,
    inputs=[gr.Textbox(example_text, label="Text")],
    outputs=[gr.Audio(type="numpy", label="Speech", autoplay=True)],
)
iface.launch()