Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,28 +1,18 @@
|
|
1 |
INTROTXT = """# StyleTTS 2
|
2 |
kudos to mrfakename for the base gradio code I'm borrowing here.
|
3 |
-
|
4 |
-
|
5 |
日本語用
|
6 |
-
|
7 |
You will probably experience slight artifacts at the beginning or at the end of the output, which is not there on my server.
|
8 |
-
|
9 |
Unfortunately, due to the variation in how floating-point operations are performed across different devices,
|
10 |
and given the intrinsic characteristics of models that incorporate diffusion components,
|
11 |
it is unlikely that you will achieve identical results to those obtained on my server, where the model was originally trained.
|
12 |
So, the output you're about to hear may not accurately reflect the true performance of the model.
|
13 |
it is also not limited to the artifacts, even the prosody and natural-ness of the speech is affected.
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
=========
|
18 |
-
|
19 |
้ณๅฃฐใฎ้ๅงๆใพใใฏ็ตไบๆใซใใใจใใจๅญๅจใใชใใฃใใฏใใฎใขใผใใฃใใกใฏใใใใใใง็บ็ใใๅฏ่ฝๆงใใใใพใใ
|
20 |
-
|
21 |
ๆฎๅฟตใชใใใ็ฐใชใใใใคในใงๆตฎๅๅฐๆฐ็นๆผ็ฎใ็ฐใชใๆนๆณใง่กใใใใใใใใใณDiffusionใณใณใใผใใณใใๅใๅ
ฅใใใขใใซใฎๅบๆใฎ็นๆงใ่ๆ
ฎใใใจใ
|
22 |
ใขใใซใๅ
ใ
ใใฌใผใใณใฐใใใใใใคในใงๅพใใใ็ตๆใจๅใ็ตๆใๅพใใใจใฏ้ฃใใใงใใใใ
|
23 |
ใใฎ็ตๆใไปฅไธใงไฝ้จใใใใใฉใผใใณในใฏใขใใซใฎ็ใฎๆง่ฝใๆญฃ็ขบใซๅๆ ใใฆใใพใใใ
|
24 |
ใใฎใใใใขใผใใฃใใกใฏใใฎๅ้กใ ใใงใฏใชใใใใใฅใฉใซใในใ้ณๅฃฐใฏใชใชใใฃใผใซใๅใณใพใใ
|
25 |
-
|
26 |
**
|
27 |
"""
|
28 |
import gradio as gr
|
@@ -59,7 +49,7 @@ for v in voicelist:
|
|
59 |
# # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
|
60 |
# return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
|
61 |
if not torch.cuda.is_available(): INTROTXT += "\n\n### on CPU, it'll run rather slower, but not too much."
|
62 |
-
def synthesize(text, voice,
|
63 |
if text.strip() == "":
|
64 |
raise gr.Error("You must enter some text")
|
65 |
if len(text) > 50000:
|
@@ -72,7 +62,7 @@ def synthesize(text, voice,embscale,alpha,beta, lngsteps, progress=gr.Progress()
|
|
72 |
audios = []
|
73 |
for t in progress.tqdm(texts):
|
74 |
print(t)
|
75 |
-
audios.append(styletts2importable.inference(t, voices[v], alpha=
|
76 |
return (24000, np.concatenate(audios))
|
77 |
# def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
|
78 |
# if password == os.environ['ACCESS_CODE']:
|
@@ -108,14 +98,18 @@ def clsynthesize(text, voice, vcsteps, embscale, alpha, beta, progress=gr.Progre
|
|
108 |
print(text)
|
109 |
print("*** end ***")
|
110 |
texts = txtsplit(text)
|
|
|
111 |
audios = []
|
112 |
# vs = styletts2importable.compute_style(voice)
|
113 |
-
|
114 |
# print(vs)
|
115 |
for t in progress.tqdm(texts):
|
116 |
-
audios.append(styletts2importable.inference(t,
|
117 |
# audios.append(styletts2importable.inference(t, vs, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=5))
|
118 |
return (24000, np.concatenate(audios))
|
|
|
|
|
|
|
119 |
def ljsynthesize(text, steps,embscale, progress=gr.Progress()):
|
120 |
# if text.strip() == "":
|
121 |
# raise gr.Error("You must enter some text")
|
@@ -141,23 +135,22 @@ def ljsynthesize(text, steps,embscale, progress=gr.Progress()):
|
|
141 |
with gr.Blocks() as vctk:
|
142 |
with gr.Row():
|
143 |
with gr.Column(scale=1):
|
144 |
-
|
145 |
-
voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.",
|
146 |
-
|
147 |
-
|
148 |
-
alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha",
|
149 |
-
beta = gr.Slider(minimum=0, maximum=1, value=0.
|
150 |
-
# use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
|
151 |
with gr.Column(scale=1):
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
with gr.Blocks() as clone:
|
156 |
with gr.Row():
|
157 |
with gr.Column(scale=1):
|
158 |
clinp = gr.Textbox(label="Text", info="Enter the text | ใใญในใใๅ
ฅใใฆใใ ใใใ็ญใใใใจใฒใฉใใชใใพใ", interactive=True)
|
159 |
clvoice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'})
|
160 |
-
vcsteps = gr.Slider(minimum=3, maximum=
|
161 |
embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!", interactive=True)
|
162 |
alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
|
163 |
beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
|
@@ -196,5 +189,4 @@ the base code was borrowed from -> [mrfakename](https://twitter.com/realmrfakena
|
|
196 |
""") # Please do not remove this line.
|
197 |
if __name__ == "__main__":
|
198 |
# demo.queue(api_open=False, max_size=15).launch(show_api=False)
|
199 |
-
demo.queue(api_open=False, max_size=15).launch(show_api=False)
|
200 |
-
|
|
|
1 |
INTROTXT = """# StyleTTS 2
|
2 |
kudos to mrfakename for the base gradio code I'm borrowing here.
|
|
|
|
|
3 |
日本語用
|
|
|
4 |
You will probably experience slight artifacts at the beginning or at the end of the output, which is not there on my server.
|
|
|
5 |
Unfortunately, due to the variation in how floating-point operations are performed across different devices,
|
6 |
and given the intrinsic characteristics of models that incorporate diffusion components,
|
7 |
it is unlikely that you will achieve identical results to those obtained on my server, where the model was originally trained.
|
8 |
So, the output you're about to hear may not accurately reflect the true performance of the model.
|
9 |
it is also not limited to the artifacts, even the prosody and natural-ness of the speech is affected.
|
|
|
|
|
|
|
10 |
=========
|
|
|
11 |
้ณๅฃฐใฎ้ๅงๆใพใใฏ็ตไบๆใซใใใจใใจๅญๅจใใชใใฃใใฏใใฎใขใผใใฃใใกใฏใใใใใใง็บ็ใใๅฏ่ฝๆงใใใใพใใ
|
|
|
12 |
ๆฎๅฟตใชใใใ็ฐใชใใใใคในใงๆตฎๅๅฐๆฐ็นๆผ็ฎใ็ฐใชใๆนๆณใง่กใใใใใใใใใณDiffusionใณใณใใผใใณใใๅใๅ
ฅใใใขใใซใฎๅบๆใฎ็นๆงใ่ๆ
ฎใใใจใ
|
13 |
ใขใใซใๅ
ใ
ใใฌใผใใณใฐใใใใใใคในใงๅพใใใ็ตๆใจๅใ็ตๆใๅพใใใจใฏ้ฃใใใงใใใใ
|
14 |
ใใฎ็ตๆใไปฅไธใงไฝ้จใใใใใฉใผใใณในใฏใขใใซใฎ็ใฎๆง่ฝใๆญฃ็ขบใซๅๆ ใใฆใใพใใใ
|
15 |
ใใฎใใใใขใผใใฃใใกใฏใใฎๅ้กใ ใใงใฏใชใใใใใฅใฉใซใในใ้ณๅฃฐใฏใชใชใใฃใผใซใๅใณใพใใ
|
|
|
16 |
**
|
17 |
"""
|
18 |
import gradio as gr
|
|
|
49 |
# # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
|
50 |
# return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
|
51 |
if not torch.cuda.is_available(): INTROTXT += "\n\n### on CPU, it'll run rather slower, but not too much."
|
52 |
+
def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
|
53 |
if text.strip() == "":
|
54 |
raise gr.Error("You must enter some text")
|
55 |
if len(text) > 50000:
|
|
|
62 |
audios = []
|
63 |
for t in progress.tqdm(texts):
|
64 |
print(t)
|
65 |
+
audios.append(styletts2importable.inference(t, voices[v], alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1))
|
66 |
return (24000, np.concatenate(audios))
|
67 |
# def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
|
68 |
# if password == os.environ['ACCESS_CODE']:
|
|
|
98 |
print(text)
|
99 |
print("*** end ***")
|
100 |
texts = txtsplit(text)
|
101 |
+
|
102 |
audios = []
|
103 |
# vs = styletts2importable.compute_style(voice)
|
104 |
+
|
105 |
# print(vs)
|
106 |
for t in progress.tqdm(texts):
|
107 |
+
audios.append(styletts2importable.inference(t, voices[v], alpha=alpha, beta=beta, diffusion_steps=vcsteps, embedding_scale=embscale))
|
108 |
# audios.append(styletts2importable.inference(t, vs, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=5))
|
109 |
return (24000, np.concatenate(audios))
|
110 |
+
|
111 |
+
|
112 |
+
|
113 |
def ljsynthesize(text, steps,embscale, progress=gr.Progress()):
|
114 |
# if text.strip() == "":
|
115 |
# raise gr.Error("You must enter some text")
|
|
|
135 |
with gr.Blocks() as vctk:
|
136 |
with gr.Row():
|
137 |
with gr.Column(scale=1):
|
138 |
+
clinp = gr.Textbox(label="Text", info="Enter the text | ใใญในใใๅ
ฅใใฆใใ ใใใ็ญใใใใจใฒใฉใใชใใพใ",value="ใใชใใใใชใใจใไธ็ใฏ่ฒ่คชใใฆ่ฆใใพใใใใชใใฎ็ฌ้กใ็งใฎๆฅใ
ใๆใใ็
งใใใฆใใพใใใใชใใใใชใๆฅใฏใใพใใงๅฌใฎใใใซๅฏใใๆใใงใ." interactive=True)
|
139 |
+
voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", interactive=True)
|
140 |
+
vcsteps = gr.Slider(minimum=3, maximum=20, value=5, step=1, label="Diffusion Steps", info="You'll get more variation in the results if you increase it, doesn't necessarily improve anything.| ใใใไธใใใใใฃใจใจใขใผใทใงใใซใช้ณๅฃฐใซใชใใพใ๏ผไธใใใใใฎ้๏ผใๅขใใใใใใจใ ใใซใชใใฎใงใใๆณจๆใใ ใใ", interactive=True)
|
141 |
+
embscale = gr.Slider(minimum=1, maximum=10, value=1.8, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!", interactive=True)
|
142 |
+
alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", interactive=True)
|
143 |
+
beta = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1, label="Beta", interactive=True)
|
|
|
144 |
with gr.Column(scale=1):
|
145 |
+
clbtn = gr.Button("Synthesize", variant="primary")
|
146 |
+
claudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
|
147 |
+
clbtn.click(clsynthesize, inputs=[clinp, voice, vcsteps, embscale, alpha, beta], outputs=[claudio], concurrency_limit=4)
|
148 |
with gr.Blocks() as clone:
|
149 |
with gr.Row():
|
150 |
with gr.Column(scale=1):
|
151 |
clinp = gr.Textbox(label="Text", info="Enter the text | ใใญในใใๅ
ฅใใฆใใ ใใใ็ญใใใใจใฒใฉใใชใใพใ", interactive=True)
|
152 |
clvoice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'})
|
153 |
+
vcsteps = gr.Slider(minimum=3, maximum=10, value=2, step=1, label="Diffusion Steps", info="ใใใไธใใใใใฃใจใจใขใผใทใงใใซใช้ณๅฃฐใซใชใใพใ๏ผไธใใใใใฎ้๏ผใๅขใใใใใใจใ ใใซใชใใฎใงใใๆณจๆใใ ใใ", interactive=True)
|
154 |
embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!", interactive=True)
|
155 |
alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
|
156 |
beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
|
|
|
189 |
""") # Please do not remove this line.
|
190 |
if __name__ == "__main__":
|
191 |
# demo.queue(api_open=False, max_size=15).launch(show_api=False)
|
192 |
+
demo.queue(api_open=False, max_size=15).launch(show_api=False,share=True)
|
|