Text_to_Speech / app.py
Rozinamax's picture
Update app.py
20d4ba4
import os
import torch
import gradio as gr
import torchaudio
import time
from datetime import datetime
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice, load_voices
VOICE_OPTIONS = [
"angie",
"applejack",
"atkins",
"barack_obama",
"daniel",
"daws",
"deniro",
"dortice",
"dreams",
"emma",
"empire",
"freeman",
"geralt",
"grace",
"halle",
"jane_eyre",
"jlaw",
"kennard",
"lescault",
"lj",
"mol",
"mouse",
"myself",
"pat",
"pat2",
"rainbow",
"sanjita",
"snakes",
"tim_reynolds",
"tom",
"weaver",
"william",
"random",
]
def inference(
text,
voice,
Emotion,
Preset,
):
texts = [text]
Angry_tone = "[I am so angry]"
Sad_tone = "[I am so sad]"
Happy_tone = "[I am so happy]"
Scared_tone = "[I am so scared]"
if Emotion == "Angry":
text = Angry_tone + text
if Emotion == "Sad":
text = Sad_tone + text
if Emotion == "Happy":
text = Happy_tone + text
if Emotion == "Scared":
text = Scared_tone + text
voices = [voice]
if len(voices) == 1:
voice_samples, conditioning_latents = load_voice(voice)
else:
voice_samples, conditioning_latents = load_voices(voices)
audio_frames = []
for j, text in enumerate(texts):
for audio_frame in tts.tts_with_preset(
text,
voice_samples=voice_samples,
conditioning_latents=conditioning_latents,
preset=Preset,
k=1
):
audio_frames.append(torch.from_numpy(audio_frame.cpu().detach().numpy()))
complete_audio = torch.cat(audio_frames, dim=0)
yield (24000, complete_audio.numpy())
def main():
title = "TTS "
text = gr.Textbox(
lines=4,
label="Text:",
)
voice = gr.Dropdown(
VOICE_OPTIONS, value="jane_eyre", label="Select voice:", type="value"
)
Emotion = gr.Radio(
["Angry", "Sad", "Happy", "Scared"],
type="value",
)
Preset = gr.Radio(
["ultra_fast", "fast", "standard", "high_quality"],
type="value",
value="ultra_fast",
)
output_audio = gr.Audio(label="streaming audio:", streaming=True, autoplay=True)
interface = gr.Interface(
fn=inference,
inputs=[
text,
voice,
Emotion,
Preset,
],
title=title,
outputs=[output_audio],
)
interface.queue().launch()
if __name__ == "__main__":
tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)
with open("Tortoise_TTS_Runs_Scripts.log", "a") as f:
f.write(
f"\n\n-------------------------Tortoise TTS Scripts Logs, {datetime.now()}-------------------------\n"
)
main()