Spaces:
Running
Running
from logging import getLogger | |
import gradio as gr | |
import numpy as np | |
import numpy.typing as npt | |
from en_tts import Synthesizer, Transcriber | |
# Amplitude limits of floating-point WAV samples (float32 and float64 alike).
FLOAT32_64_MIN_WAV, FLOAT32_64_MAX_WAV = -1.0, 1.0

# Value range of 16-bit signed PCM samples: -(2**15) .. 2**15 - 1.
INT16_MIN, INT16_MAX = np.iinfo(np.int16).min, np.iinfo(np.int16).max

# Value range of 32-bit signed PCM samples: -(2**31) .. 2**31 - 1.
INT32_MIN, INT32_MAX = np.iinfo(np.int32).min, np.iinfo(np.int32).max
# Module-level logger, named after this module.
logger = getLogger(__name__)

# The transcriber and the synthesizer are constructed once, at import time,
# and the same two instances are reused by every call to synt().
# NOTE(review): no logging configuration is visible in this file, so these
# INFO records may not be shown unless a handler/level is set up elsewhere.
logger.info("Initializing transcriber...")
transcriber = Transcriber()
logger.info("Initializing synthesizer...")
synthesizer = Synthesizer()
def synt(text: str) -> tuple[int, npt.NDArray[np.int16]]:
    """Synthesize speech for *text*.

    The text is transcribed to IPA, the IPA is synthesized to a waveform,
    and the waveform is converted to 16-bit integer samples.

    Args:
        text: The text to speak.

    Returns:
        A ``(sampling_rate, samples)`` pair; this function is wired to a
        ``gr.Audio(type="numpy")`` output, which takes that form.
    """
    logger.info("Transcribing...")
    text_ipa = transcriber.transcribe_to_ipa(text)

    logger.info("Synthesizing...")
    audio = synthesizer.synthesize(text_ipa)
    audio_int = convert_wav(audio, np.int16)

    # NOTE(review): the 22050 Hz rate is hard-coded; presumably it is the
    # synthesizer's output rate -- confirm against en_tts.
    return 22050, audio_int
def get_max_value(dtype):
    """Return the largest sample value of the WAV sample type *dtype*.

    Args:
        dtype: ``np.int16``, ``np.int32``, ``np.float32`` or ``np.float64``
            (anything that compares equal to one of them is accepted).

    Returns:
        ``INT16_MAX``, ``INT32_MAX`` or ``FLOAT32_64_MAX_WAV`` respectively.

    Raises:
        ValueError: If *dtype* is none of the supported sample types.
    """
    # see wavfile.write(): the max positive value, e.g. on 16-bit PCM, is 32767
    if dtype == np.int16:
        return INT16_MAX
    if dtype == np.int32:
        return INT32_MAX
    if dtype in (np.float32, np.float64):
        return FLOAT32_64_MAX_WAV
    # Raise explicitly instead of `assert False`: assertions are stripped
    # under `python -O`, which would make this silently return None.
    raise ValueError(f"Unsupported dtype: {dtype!r}")
def get_min_value(dtype):
    """Return the smallest sample value of the WAV sample type *dtype*.

    Args:
        dtype: ``np.int16``, ``np.int32``, ``np.float32`` or ``np.float64``
            (anything that compares equal to one of them is accepted).

    Returns:
        ``INT16_MIN``, ``INT32_MIN`` or ``FLOAT32_64_MIN_WAV`` respectively.

    Raises:
        ValueError: If *dtype* is none of the supported sample types.
    """
    if dtype == np.int16:
        return INT16_MIN
    if dtype == np.int32:
        return INT32_MIN
    if dtype in (np.float32, np.float64):
        return FLOAT32_64_MIN_WAV
    # Raise explicitly instead of `assert False`: assertions are stripped
    # under `python -O`, which would make this silently return None.
    raise ValueError(f"Unsupported dtype: {dtype!r}")
def convert_wav(wav: npt.NDArray[np.float64], to_dtype):
    """Rescale *wav* to the value range of *to_dtype* and cast it.

    Samples are divided by the magnitude of the source type's minimum value
    and multiplied by the target type's maximum value. If *wav* already has
    dtype *to_dtype* it is returned unchanged (the same object).

    Over-amplified input stays over-amplified for floating-point targets.
    For integer targets it is clipped to the target range, because values
    outside that range cannot be represented and would otherwise be cast to
    wrapped-around or undefined values.

    Args:
        wav: The samples to convert.
        to_dtype: Target sample type (``np.int16``, ``np.int32``,
            ``np.float32`` or ``np.float64``).

    Returns:
        The converted samples as an array of dtype *to_dtype*.
    """
    if wav.dtype == to_dtype:
        return wav

    # Scale in float64 so that the int32 bounds are represented exactly.
    scaled = np.asarray(wav, dtype=np.float64)
    scaled = scaled / (-1 * get_min_value(wav.dtype)) * get_max_value(to_dtype)

    if to_dtype in (np.int16, np.int32):
        # astype() truncates toward zero (like np.fix), so round first.
        scaled = np.round(scaled, 0)
        # Keep out-of-range samples from overflowing the integer cast.
        scaled = np.clip(scaled, get_min_value(to_dtype), get_max_value(to_dtype))

    return scaled.astype(to_dtype)
# Text pre-filled in the input box.
example_text = "When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow."

# UI components: one text input, one audio output fed with numpy data.
text_input = gr.Textbox(example_text, label="Text")
speech_output = gr.Audio(type="numpy", label="Speech", autoplay=True)

# Wire synt() between the two components and start the app.
iface = gr.Interface(fn=synt, inputs=[text_input], outputs=[speech_output])
iface.launch()