File size: 2,138 Bytes
a31f9ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from logging import getLogger

import gradio as gr
import numpy as np
import numpy.typing as npt
from en_tts import Synthesizer, Transcriber

# Nominal sample range of floating-point WAV data (float32 and float64).
FLOAT32_64_MIN_WAV, FLOAT32_64_MAX_WAV = -1.0, 1.0

# Representable range of 16-bit PCM samples: -(2**15) .. 2**15 - 1.
INT16_MIN, INT16_MAX = np.iinfo(np.int16).min, np.iinfo(np.int16).max

# Representable range of 32-bit PCM samples: -(2**31) .. 2**31 - 1.
INT32_MIN, INT32_MAX = np.iinfo(np.int32).min, np.iinfo(np.int32).max

# Module-level logger named after this module.
# NOTE(review): no logging handler or level is configured in this file, so the
# INFO records below may be dropped unless logging is set up elsewhere -- confirm.
logger = getLogger(__name__)
# The transcriber is built once at import time and reused by every synt() call.
# NOTE(review): construction is presumably expensive (resource loading) -- confirm.
logger.info("Initializing transcriber...")
transcriber = Transcriber()

# The synthesizer is likewise a single shared instance created at import time.
logger.info("Initializing synthesizer...")
synthesizer = Synthesizer()


def synt(text: str) -> tuple[int, npt.NDArray[np.int16]]:
    """Synthesize speech for ``text`` and return it as 16-bit PCM.

    The text is first transcribed to IPA with the module-level ``transcriber``,
    the IPA is rendered to audio by the module-level ``synthesizer``, and the
    resulting samples are converted to ``np.int16`` via ``convert_wav``.

    Args:
        text: The input text to speak.

    Returns:
        A ``(sample_rate, samples)`` pair, which is the form handed to the
        ``gr.Audio(type="numpy")`` output of the interface defined in this
        file.
    """
    # Hard-coded output rate in Hz.
    # NOTE(review): presumably the synthesizer's native rate -- confirm.
    sample_rate = 22050

    logger.info("Transcribing...")
    text_ipa = transcriber.transcribe_to_ipa(text)

    logger.info("Synthesizing...")
    audio = synthesizer.synthesize(text_ipa)
    audio_int = convert_wav(audio, np.int16)
    return sample_rate, audio_int


def get_max_value(dtype):
    """Return the largest nominal WAV sample value for ``dtype``.

    Args:
        dtype: One of ``np.int16``, ``np.int32``, ``np.float32`` or
            ``np.float64`` (or anything comparing equal to one of them).

    Returns:
        ``INT16_MAX`` / ``INT32_MAX`` for the integer types (e.g. 32767 for
        16-bit PCM, the maximum positive value ``wavfile.write()`` expects),
        or ``FLOAT32_64_MAX_WAV`` for the floating-point types.

    Raises:
        AssertionError: If ``dtype`` is not one of the supported types.
    """
    if dtype == np.int16:
        return INT16_MAX

    if dtype == np.int32:
        return INT32_MAX

    if dtype in (np.float32, np.float64):
        return FLOAT32_64_MAX_WAV

    # Raised explicitly instead of ``assert False``: a bare assert is removed
    # under ``python -O`` and the function would then silently return None.
    # The exception type is kept so existing callers see the same error.
    raise AssertionError(f"unsupported dtype: {dtype!r}")


def get_min_value(dtype):
    """Return the smallest nominal WAV sample value for ``dtype``.

    Args:
        dtype: One of ``np.int16``, ``np.int32``, ``np.float32`` or
            ``np.float64`` (or anything comparing equal to one of them).

    Returns:
        ``INT16_MIN`` / ``INT32_MIN`` for the integer types, or
        ``FLOAT32_64_MIN_WAV`` for the floating-point types.

    Raises:
        AssertionError: If ``dtype`` is not one of the supported types.
    """
    if dtype == np.int16:
        return INT16_MIN

    if dtype == np.int32:
        return INT32_MIN

    if dtype in (np.float32, np.float64):
        return FLOAT32_64_MIN_WAV

    # Raised explicitly instead of ``assert False``: a bare assert is removed
    # under ``python -O`` and the function would then silently return None.
    # The exception type is kept so existing callers see the same error.
    raise AssertionError(f"unsupported dtype: {dtype!r}")


def convert_wav(wav: npt.NDArray[np.float64], to_dtype):
    """Rescale WAV samples from their current dtype to ``to_dtype``.

    The samples are divided by the magnitude of the source dtype's minimum
    value and multiplied by the target dtype's maximum value (see
    ``get_min_value`` / ``get_max_value``). No clipping is applied: if the
    input is over-amplified, the result will also be over-amplified.

    Args:
        wav: The samples to convert. Although annotated as float64, any dtype
            accepted by ``get_min_value`` is handled.
        to_dtype: Target dtype; must be accepted by ``get_max_value``.

    Returns:
        ``wav`` itself when it already has ``to_dtype``, otherwise a new array
        of dtype ``to_dtype``.

    Raises:
        AssertionError: Propagated from ``get_min_value`` / ``get_max_value``
            when either dtype is unsupported.
    """
    if wav.dtype == to_dtype:
        return wav

    source_full_scale = -1 * get_min_value(wav.dtype)
    target_full_scale = get_max_value(to_dtype)

    # Scale in float64. With float32 input the arithmetic would otherwise stay
    # in float32, where INT32_MAX is not representable and rounds up to 2**31,
    # so a full-scale positive sample would overflow on the int32 cast.
    scaled = wav.astype(np.float64, copy=False) / source_full_scale * target_full_scale

    if to_dtype in (np.int16, np.int32):
        # astype() truncates toward zero (like np.fix), so round explicitly.
        scaled = np.round(scaled, 0)

    return scaled.astype(to_dtype)


# Default sentence pre-filled in the input textbox.
example_text = (
    "When the sunlight strikes raindrops in the air, "
    "they act as a prism and form a rainbow."
)

# Single text input wired to synt(); the result is shown in an auto-playing
# audio component that takes numpy data.
iface = gr.Interface(
    fn=synt,
    inputs=[gr.Textbox(value=example_text, label="Text")],
    outputs=[gr.Audio(label="Speech", type="numpy", autoplay=True)],
)

# Start the Gradio app as soon as this module is executed.
iface.launch()