INTROTXT = """#
**A large-scale model with a whole new pre-processing pipeline, added features, and some tweaks to the architecture is being trained right now.**
_______________________________________________________________________________________________________________________________________________________________
Kudos to mrfakename for the base Gradio code I'm borrowing here.
**日本語用**
You will probably experience slight artifacts at the beginning or at the end of the output, which are not there on my server.
Unfortunately, due to the variation in how floating-point operations are performed across different devices,
and given the intrinsic characteristics of models that incorporate diffusion components,
it is unlikely that you will achieve results identical to those obtained on my server, where the model was originally trained.
So, the output you're about to hear may not accurately reflect the true performance of the model.
This is not limited to artifacts, either: even the prosody and naturalness of the speech are affected.
by [Soshyant](https://twitter.com/MystiqCaleid)
**NOTE: Punctuation is important!**
=========
音声の開始時または終了時に、もともと存在しなかったはずのアーティファクトがわずかに発生する可能性があります。
残念ながら、異なるデバイスで浮動小数点演算が異なる方法で行われること、およびDiffusionコンポーネントを取り入れたモデルの固有の特性を考慮すると、
モデルが元々トレーニングされたデバイスで得られた結果と同じ結果を得ることは難しいでしょう。
その結果、以下で体験するパフォーマンスはモデルの真の性能を正確に反映していません。
そのため、アーティファクトの問題だけではなく、ナチュラルネスや音声クオリティーにも及びます。
**NOTE: 句読点はとても大事です!**
"""
import gradio as gr
import random
import styletts2importable
import ljspeechimportable
import torch
import os
from txtsplit import txtsplit
import numpy as np
import pickle
theme = gr.themes.Base(
font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
)
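# note: the demo Blocks at the bottom currently passes theme="NoCrypt/miku",
# so this custom font theme is defined but unused; pass theme=theme there to use it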
from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule
voicelist = ['1','2','3']
voices = {}
# import phonemizer
# global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
# cache computed styles with pickle so restarts skip recomputation; the styles
# are torch tensors, so delete voices.pkl after changing the files in voices/
# or when moving between CPU and GPU machines
if os.path.exists('voices.pkl'):
    with open('voices.pkl', 'rb') as f:
        voices = pickle.load(f)
else:
    for v in voicelist:
        voices[v] = styletts2importable.compute_style(f'voices/{v}.wav')
    with open('voices.pkl', 'wb') as f:
        pickle.dump(voices, f)
# def synthesize(text, voice, multispeakersteps):
# if text.strip() == "":
# raise gr.Error("You must enter some text")
# # if len(global_phonemizer.phonemize([text])) > 300:
# if len(text) > 300:
# raise gr.Error("Text must be under 300 characters")
# v = voice.lower()
# # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
# return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
if not torch.cuda.is_available(): INTROTXT += "\n\n### Running on CPU, so it'll be somewhat slower, but not by much."
def synthesize(text, voice, lngsteps, embscale, alpha, beta, progress=gr.Progress()):
    if text.strip() == "":
        raise gr.Error("You must enter some text")
    if len(text) > 50000:
        raise gr.Error("Text must be <50k characters")
    print("*** saying ***")
    print(text)
    print("*** end ***")
    texts = txtsplit(text)
    v = voice.lower()
    audios = []
    for t in progress.tqdm(texts):
        print(t)
        audios.append(styletts2importable.inference(t, voices[v], alpha=alpha, beta=beta, diffusion_steps=lngsteps, embedding_scale=embscale))
    return (24000, np.concatenate(audios))
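# each synthesis function returns a (sample_rate, numpy_array) tuple, the format
# gr.Audio expects for numpy output; this model generates 24 kHz audio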
# def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
# if password == os.environ['ACCESS_CODE']:
# if text.strip() == "":
# raise gr.Error("You must enter some text")
# if lngsteps > 25:
# raise gr.Error("Max 25 steps")
# if lngsteps < 5:
# raise gr.Error("Min 5 steps")
# texts = split_and_recombine_text(text)
# v = voice.lower()
# audios = []
# for t in progress.tqdm(texts):
# audios.append(styletts2importable.inference(t, voices[v], alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1))
# return (24000, np.concatenate(audios))
# else:
# raise gr.Error('Wrong access code')
def clsynthesize(text, voice, vcsteps, embscale, alpha, beta, progress=gr.Progress()):
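    # pin the seeds and force deterministic cuDNN kernels so repeated runs with
    # the same inputs are as reproducible as the hardware allows (see INTROTXT)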
    torch.manual_seed(0)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    random.seed(0)
# if text.strip() == "":
# raise gr.Error("You must enter some text")
# # if global_phonemizer.phonemize([text]) > 300:
# if len(text) > 400:
# raise gr.Error("Text must be under 400 characters")
# # return (24000, styletts2importable.inference(text, styletts2importable.compute_style(voice), alpha=0.3, beta=0.7, diffusion_steps=20, embedding_scale=1))
# return (24000, styletts2importable.inference(text, styletts2importable.compute_style(voice), alpha=0.3, beta=0.7, diffusion_steps=vcsteps, embedding_scale=1))
    if text.strip() == "":
        raise gr.Error("You must enter some text")
    if len(text) > 50000:
        raise gr.Error("Text must be <50k characters")
    if embscale > 1.3 and len(text) < 20:
        gr.Warning("WARNING: You entered short text, you may get static!")
    print("*** saying ***")
    print(text)
    print("*** end ***")
    texts = txtsplit(text)
    audios = []
    v = voice.lower()
    # vs = styletts2importable.compute_style(voice)
    # print(vs)
    for t in progress.tqdm(texts):
        audios.append(styletts2importable.inference(t, voices[v], alpha=alpha, beta=beta, diffusion_steps=vcsteps, embedding_scale=embscale))
        # audios.append(styletts2importable.inference(t, vs, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=5))
    return (24000, np.concatenate(audios))
def ljsynthesize(text, steps, embscale, progress=gr.Progress()):
    torch.manual_seed(0)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    random.seed(0)
# if text.strip() == "":
# raise gr.Error("You must enter some text")
# # if global_phonemizer.phonemize([text]) > 300:
# if len(text) > 400:
# raise gr.Error("Text must be under 400 characters")
    noise = torch.tanh(torch.randn(1, 1, 256).to('cuda' if torch.cuda.is_available() else 'cpu'))
    # return (24000, ljspeechimportable.inference(text, noise, diffusion_steps=7, embedding_scale=1))
    if text.strip() == "":
        raise gr.Error("You must enter some text")
    if len(text) > 150000:
        raise gr.Error("Text must be <150k characters")
    print("*** saying ***")
    print(text)
    print("*** end ***")
    texts = txtsplit(text)
    audios = []
    for t in progress.tqdm(texts):
        audios.append(ljspeechimportable.inference(t, noise, diffusion_steps=steps, embedding_scale=embscale))
    return (24000, np.concatenate(audios))
# with gr.Blocks() as vctk:
# with gr.Row():
# with gr.Column(scale=1):
# clinp = gr.Textbox(label="Text", info="Enter the text | テキストを入れてください。短すぎるとひどくなります", value="あなたがいないと、世界は色褪せて見えます。あなたの笑顔が私の日々を明るく照らしてくれます。あなたがいない日は、まるで冬のように寒く、暗いです。", interactive=True)
# voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", interactive=True)
# vcsteps = gr.Slider(minimum=3, maximum=20, value=5, step=1, label="Diffusion Steps", info="You'll get more variation in the results if you increase it, doesn't necessarily improve anything. | これを上げると結果のバリエーションが増えますが、必ずしも良くなるわけではありません", interactive=True)
# embscale = gr.Slider(minimum=1, maximum=10, value=1.8, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="これを上げるともっとエモーショナルな音声になります（下げるとその逆）。増やしすぎるとダメになるのでご注意ください", interactive=True)
# alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", interactive=True)
# beta = gr.Slider(minimum=0, maximum=1, value=0.4, step=0.1, label="Beta", interactive=True)
# with gr.Column(scale=1):
# clbtn = gr.Button("Synthesize", variant="primary")
# claudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
# clbtn.click(clsynthesize, inputs=[clinp, voice, vcsteps, embscale, alpha, beta], outputs=[claudio], concurrency_limit=4)
with gr.Blocks() as vctk:
    with gr.Row():
        with gr.Column(scale=1):
            inp = gr.Textbox(label="Text", info="Enter the text | テキストを入れてください。短すぎるとひどくなります。", value="長い間、生き物との繋がりは、私にとってかけがえのないものでした。言葉を交わし、聞き、共有し、互いの世界を少しずつ理解していく、その時間は、私の人生において非常に重要なものです。", interactive=True)
            voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='3', interactive=True)
            embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Higher values give more emotional speech (lower gives the opposite); overdoing it breaks the output | これを上げるともっとエモーショナルな音声になります（下げるとその逆）。増やしすぎるとダメになるのでご注意ください", interactive=True)
            alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Closer to 0 means similar to the Audio Reference | ゼロに近いほど、音声サンプルのスタイルに近い音声になります", interactive=True)
            beta = gr.Slider(minimum=0, maximum=1, value=0.5, step=0.1, label="Beta", info="Same as Alpha | alphaと同じく", interactive=True)
            multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", interactive=True)
            # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
        with gr.Column(scale=1):
            btn = gr.Button("Synthesize", variant="primary")
            audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
            btn.click(synthesize, inputs=[inp, voice, multispeakersteps, embscale, alpha, beta], outputs=[audio], concurrency_limit=4)
# with gr.Blocks() as clone:
# with gr.Row():
# with gr.Column(scale=1):
# clinp = gr.Textbox(label="Text", info="Enter the text | テキストを入れてください。短すぎるとひどくなります", interactive=True)
# clvoice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'})
# vcsteps = gr.Slider(minimum=3, maximum=10, value=2, step=1, label="Diffusion Steps", info="これを上げるともっとエモーショナルな音声になります（下げるとその逆）。増やしすぎるとダメになるのでご注意ください", interactive=True)
# embscale = gr.Slider(minimum=1, maximum=10, value=1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Defaults to 1. WARNING: If you set this too high and generate text that's too short you will get static!", interactive=True)
# alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3", interactive=True)
# beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7", interactive=True)
# with gr.Column(scale=1):
# clbtn = gr.Button("Synthesize", variant="primary")
# claudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
# clbtn.click(clsynthesize, inputs=[clinp, clvoice, vcsteps, embscale, alpha, beta], outputs=[claudio], concurrency_limit=4)
# with gr.Blocks() as longText:
# with gr.Row():
# with gr.Column(scale=1):
# lnginp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
# lngvoice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-1', interactive=True)
# lngsteps = gr.Slider(minimum=5, maximum=25, value=10, step=1, label="Diffusion Steps", info="Higher = better quality, but slower", interactive=True)
# lngpwd = gr.Textbox(label="Access code", info="This feature is in beta. You need an access code to use it as it uses more resources and we would like to prevent abuse")
# with gr.Column(scale=1):
# lngbtn = gr.Button("Synthesize", variant="primary")
# lngaudio = gr.Audio(interactive=False, label="Synthesized Audio")
# lngbtn.click(longsynthesize, inputs=[lnginp, lngvoice, lngsteps, lngpwd], outputs=[lngaudio], concurrency_limit=4)
with gr.Blocks() as lj:
    with gr.Row():
        with gr.Column(scale=1):
            ljinp = gr.Textbox(label="Text", info="Enter the text | テキストを入れてください。短すぎるとひどくなります。", interactive=True, value="あなたがいないと、世界は色褪せて見えます。あなたの笑顔が私の日々を明るく照らしてくれます。あなたがいない日は、まるで冬のように寒く、暗いです。")
            embscale = gr.Slider(minimum=1, maximum=3, value=1.1, step=0.1, label="Embedding Scale (READ WARNING BELOW)", info="Higher values give more emotional speech; overdoing it breaks the output (1 to 2 is the optimal range) | これを上げるともっとエモーショナルな音声になります（下げるとその逆）。増やしすぎるとダメになるのでご注意ください（1から2までの範囲が最適）", interactive=True)
            ljsteps = gr.Slider(minimum=3, maximum=20, value=5, step=1, label="Diffusion Steps", info="You'll get more variation in the results if you increase it; it doesn't necessarily improve anything. | これを上げると結果のバリエーションが増えますが、必ずしも良くなるわけではありません", interactive=True)
        with gr.Column(scale=1):
            ljbtn = gr.Button("Synthesize", variant="primary")
            ljaudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
            ljbtn.click(ljsynthesize, inputs=[ljinp, ljsteps, embscale], outputs=[ljaudio], concurrency_limit=4)
with gr.Blocks(title="StyleTTS 2", css="footer{display:none !important}", theme="NoCrypt/miku") as demo:
    gr.Markdown(INTROTXT)
    gr.DuplicateButton("Duplicate Space")
    # gr.TabbedInterface([vctk, clone, lj, longText], ['Multi-Voice', 'Voice Cloning', 'Text-guided Inference', 'Long Text [Beta]'])
    gr.TabbedInterface([lj, vctk], ['Text-guided Inference', 'With Reference Audio (Not Optimized)'])
    gr.Markdown("""
The base code was borrowed from [mrfakename](https://twitter.com/realmrfakename). Neither of us is affiliated with the authors of StyleTTS 2.
""") # Please do not remove this line.
if __name__ == "__main__":
    # demo.queue(api_open=False, max_size=15).launch(show_api=False)
    demo.queue(api_open=False, max_size=15).launch(show_api=False)