|
import asyncio |
|
import base64 |
|
import time |
|
import uuid |
|
import shutil |
|
from concurrent.futures import ThreadPoolExecutor |
|
from pathlib import Path |
|
from typing import List, Optional |
|
import subprocess |
|
|
|
import ebooklib |
|
import gradio as gr |
|
import torch |
|
import torchaudio |
|
from ebooklib import epub |
|
from bs4 import BeautifulSoup |
|
|
|
from auralis import TTS, TTSRequest, TTSOutput, AudioPreprocessingConfig, setup_logger |
|
|
|
logger = setup_logger(__file__) |
|
|
|
tts = TTS() |
|
model_path = "AstraMindAI/xttsv2" |
|
gpt_model = "AstraMindAI/xtts2-gpt" |
|
try: |
|
tts = tts.from_pretrained(model_path, gpt_model=gpt_model) |
|
logger.info(f"Successfully loaded model {model_path}") |
|
except Exception as e: |
|
logger.error(f"Failed to load model: {e}. Ensure that the model exists at {model_path}") |
|
|
|
|
|
temp_dir = Path("/tmp/auralis") |
|
temp_dir.mkdir(exist_ok=True) |
|
|
|
def convert_ebook_to_txt(input_path: str) -> str: |
|
""" |
|
Convert any ebook format to txt using calibre's ebook-convert |
|
Returns the path to the converted txt file |
|
""" |
|
output_path = str(temp_dir / f"{uuid.uuid4().hex[:8]}.txt") |
|
try: |
|
subprocess.run(['ebook-convert', input_path, output_path], |
|
check=True, capture_output=True, text=True) |
|
return output_path |
|
except subprocess.CalledProcessError as e: |
|
logger.error(f"Conversion failed: {e.stderr}") |
|
raise RuntimeError(f"Failed to convert ebook: {e.stderr}") |
|
|
|
def shorten_filename(original_path: str) -> str: |
|
"""Copies the given file to a temporary directory with a shorter, random filename.""" |
|
ext = Path(original_path).suffix |
|
short_name = "file_" + uuid.uuid4().hex[:8] + ext |
|
short_path = temp_dir / short_name |
|
shutil.copyfile(original_path, short_path) |
|
return str(short_path) |
|
|
|
def text_from_file(file_path: str) -> str: |
|
"""Read text from a file, converting if necessary.""" |
|
file_ext = Path(file_path).suffix.lower() |
|
|
|
if file_ext in ['.txt']: |
|
with open(file_path, 'r', encoding='utf-8') as f: |
|
return f.read() |
|
else: |
|
|
|
txt_path = convert_ebook_to_txt(file_path) |
|
with open(txt_path, 'r', encoding='utf-8') as f: |
|
return f.read() |
|
|
|
def clone_voice(audio_path: str): |
|
"""Clone a voice from an audio path.""" |
|
audio_short_path = shorten_filename(audio_path) |
|
with open(audio_short_path, "rb") as f: |
|
audio_data = base64.b64encode(f.read()).decode('utf-8') |
|
return audio_data |
|
|
|
def process_text_and_generate(input_text, ref_audio_files, speed, enhance_speech, temperature, top_p, top_k, repetition_penalty, language, *args): |
|
"""Process text and generate audio.""" |
|
log_messages = "" |
|
if not ref_audio_files: |
|
log_messages += "Please provide at least one reference audio!\n" |
|
return None, log_messages |
|
|
|
|
|
base64_voices = ref_audio_files[:5] |
|
|
|
request = TTSRequest( |
|
text=input_text, |
|
speaker_files=base64_voices, |
|
stream=False, |
|
enhance_speech=enhance_speech, |
|
temperature=temperature, |
|
top_p=top_p, |
|
top_k=top_k, |
|
repetition_penalty=repetition_penalty, |
|
language=language, |
|
) |
|
|
|
try: |
|
with torch.no_grad(): |
|
output = tts.generate_speech(request) |
|
if output: |
|
if speed != 1: |
|
output.change_speed(speed) |
|
log_messages += f"β
Successfully Generated audio\n" |
|
return (output.sample_rate, output.array), log_messages |
|
else: |
|
log_messages += "β No output was generated. Check that the model was correctly loaded\n" |
|
return None, log_messages |
|
except Exception as e: |
|
logger.error(f"Error: {e}") |
|
log_messages += f"β An Error occured: {e}\n" |
|
return None, log_messages |
|
|
|
def build_gradio_ui(): |
|
"""Builds and launches the Gradio UI for Auralis.""" |
|
with gr.Blocks(title="Auralis TTS Demo", theme="soft") as ui: |
|
gr.Markdown( |
|
""" |
|
# Auralis Text-to-Speech Demo π |
|
Convert text or ebooks to speech with advanced voice cloning and enhancement. |
|
""" |
|
) |
|
|
|
with gr.Tab("File to Speech"): |
|
with gr.Row(): |
|
with gr.Column(): |
|
file_input = gr.File( |
|
label="Upload Book/Text File", |
|
file_types=[ |
|
".txt", ".epub", ".mobi", ".azw3", ".fb2", |
|
".htmlz", ".lit", ".pdb", ".pdf", ".rtf" |
|
] |
|
) |
|
ref_audio_files = gr.Files( |
|
label="Reference Audio Files", |
|
file_types=["audio"] |
|
) |
|
with gr.Accordion("Advanced settings", open=False): |
|
speed = gr.Slider( |
|
label="Playback speed", |
|
minimum=0.5, |
|
maximum=2.0, |
|
value=1.0, |
|
step=0.1 |
|
) |
|
enhance_speech = gr.Checkbox( |
|
label="Enhance Reference Speech", |
|
value=False |
|
) |
|
temperature = gr.Slider( |
|
label="Temperature", |
|
minimum=0.5, |
|
maximum=1.0, |
|
value=0.75, |
|
step=0.05 |
|
) |
|
top_p = gr.Slider( |
|
label="Top P", |
|
minimum=0.5, |
|
maximum=1.0, |
|
value=0.85, |
|
step=0.05 |
|
) |
|
top_k = gr.Slider( |
|
label="Top K", |
|
minimum=0, |
|
maximum=100, |
|
value=50, |
|
step=10 |
|
) |
|
repetition_penalty = gr.Slider( |
|
label="Repetition penalty", |
|
minimum=1.0, |
|
maximum=10.0, |
|
value=5.0, |
|
step=0.5 |
|
) |
|
language = gr.Dropdown( |
|
label="Target Language", |
|
choices=[ |
|
"en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", |
|
"nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi", "auto", |
|
], |
|
value="auto" |
|
) |
|
generate_button = gr.Button("Generate Speech") |
|
with gr.Column(): |
|
audio_output = gr.Audio(label="Generated Audio") |
|
log_output = gr.Text(label="Log Output") |
|
|
|
def process_file_and_generate( |
|
file_input, ref_audio_files, speed, enhance_speech, |
|
temperature, top_p, top_k, repetition_penalty, language |
|
): |
|
if not file_input: |
|
return None, "Please provide an input file!" |
|
|
|
try: |
|
|
|
input_text = text_from_file(file_input.name) |
|
|
|
return process_text_and_generate( |
|
input_text, ref_audio_files, speed, enhance_speech, |
|
temperature, top_p, top_k, repetition_penalty, language |
|
) |
|
except Exception as e: |
|
logger.error(f"Error processing file: {e}") |
|
return None, f"Error processing file: {str(e)}" |
|
|
|
generate_button.click( |
|
process_file_and_generate, |
|
inputs=[ |
|
file_input, ref_audio_files, speed, enhance_speech, |
|
temperature, top_p, top_k, repetition_penalty, language |
|
], |
|
outputs=[audio_output, log_output], |
|
) |
|
|
|
with gr.Tab("Clone With Microphone"): |
|
with gr.Row(): |
|
with gr.Column(): |
|
file_input_mic = gr.File( |
|
label="Upload Book/Text File", |
|
file_types=[ |
|
".txt", ".epub", ".mobi", ".azw3", ".fb2", |
|
".htmlz", ".lit", ".pdb", ".pdf", ".rtf" |
|
] |
|
) |
|
mic_ref_audio = gr.Audio( |
|
label="Record Reference Audio", |
|
sources=["microphone"] |
|
) |
|
|
|
with gr.Accordion("Advanced settings", open=False): |
|
speed_mic = gr.Slider( |
|
label="Playback speed", |
|
minimum=0.5, |
|
maximum=2.0, |
|
value=1.0, |
|
step=0.1 |
|
) |
|
enhance_speech_mic = gr.Checkbox( |
|
label="Enhance Reference Speech", |
|
value=True |
|
) |
|
temperature_mic = gr.Slider( |
|
label="Temperature", |
|
minimum=0.5, |
|
maximum=1.0, |
|
value=0.75, |
|
step=0.05 |
|
) |
|
top_p_mic = gr.Slider( |
|
label="Top P", |
|
minimum=0.5, |
|
maximum=1.0, |
|
value=0.85, |
|
step=0.05 |
|
) |
|
top_k_mic = gr.Slider( |
|
label="Top K", |
|
minimum=0, |
|
maximum=100, |
|
value=50, |
|
step=10 |
|
) |
|
repetition_penalty_mic = gr.Slider( |
|
label="Repetition penalty", |
|
minimum=1.0, |
|
maximum=10.0, |
|
value=5.0, |
|
step=0.5 |
|
) |
|
language_mic = gr.Dropdown( |
|
label="Target Language", |
|
choices=[ |
|
"en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", |
|
"nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi", "auto", |
|
], |
|
value="auto" |
|
) |
|
generate_button_mic = gr.Button("Generate Speech") |
|
with gr.Column(): |
|
audio_output_mic = gr.Audio(label="Generated Audio") |
|
log_output_mic = gr.Text(label="Log Output") |
|
|
|
def process_mic_and_generate( |
|
file_input, mic_ref_audio, speed_mic, enhance_speech_mic, |
|
temperature_mic, top_p_mic, top_k_mic, repetition_penalty_mic, language_mic |
|
): |
|
if not mic_ref_audio: |
|
return None, "Please record an audio!" |
|
if not file_input: |
|
return None, "Please provide an input file!" |
|
|
|
try: |
|
|
|
input_text = text_from_file(file_input.name) |
|
|
|
|
|
data = str(time.time()).encode("utf-8") |
|
hash = hashlib.sha1(data).hexdigest()[:10] |
|
output_path = temp_dir / (f"mic_{hash}.wav") |
|
|
|
torch_audio = torch.from_numpy(mic_ref_audio[1].astype(float)) |
|
torchaudio.save( |
|
str(output_path), |
|
torch_audio.unsqueeze(0), |
|
mic_ref_audio[0] |
|
) |
|
|
|
return process_text_and_generate( |
|
input_text, [Path(output_path)], speed_mic, |
|
enhance_speech_mic, temperature_mic, top_p_mic, |
|
top_k_mic, repetition_penalty_mic, language_mic |
|
) |
|
except Exception as e: |
|
logger.error(f"Error processing input: {e}") |
|
return None, f"Error processing input: {str(e)}" |
|
|
|
generate_button_mic.click( |
|
process_mic_and_generate, |
|
inputs=[ |
|
file_input_mic, mic_ref_audio, speed_mic, |
|
enhance_speech_mic, temperature_mic, top_p_mic, |
|
top_k_mic, repetition_penalty_mic, language_mic |
|
], |
|
outputs=[audio_output_mic, log_output_mic], |
|
) |
|
|
|
return ui |
|
|
|
if __name__ == "__main__": |
|
ui = build_gradio_ui() |
|
ui.launch(debug=True, server_name="0.0.0.0", server_port=7860) |