import asyncio
import json
import logging
import os
from functools import lru_cache

import edge_tts
import gradio as gr

from tts_service.utils import cache_path
from tts_service.voices import voice_manager

log = logging.getLogger(__name__)

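# Import the RVC voice converter lazily and cache the single instance, so the
# heavy model code is only loaded the first time a conversion is requested.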
@lru_cache(maxsize=None)
def import_voice_converter():
    from rvc.infer.infer import VoiceConverter

    return VoiceConverter()

# TTS
async def run_tts_script(
    text: str,
    voice_name: str,
    rate: int = 0,
    progress=gr.Progress(),  # noqa: B008
) -> tuple[str, str]:
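    """Synthesize ``text`` with the configured edge-tts voice, then convert it
    with RVC, reusing cached audio where possible.

    Returns a status message and the path to the converted audio file.
    """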
    async def update_progress(pct, msg) -> None:
        log.debug("Progress: %.1f%%: %s", pct * 100, msg)
        progress(pct, msg)
        # Yield to the event loop so other queued tasks can run between updates.
        await asyncio.sleep(0)

    log.info("Synthesizing text (%s chars)", len(text))
    await update_progress(0, "Starting...")
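    # Stage 1: synthesize speech with edge-tts, skipping the call when a cached
    # file already exists for this voice/rate/text combination.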
    voice = voice_manager.voices[voice_name]
    format = "wav"
    text = text.strip()
    output_tts_path = cache_path(voice.tts, "", rate, text, extension=format)
    text_ptr = 0
    if not os.path.exists(output_tts_path):
        rates = f"+{rate}%" if rate >= 0 else f"{rate}%"
        communicate = edge_tts.Communicate(text, voice.tts, rate=rates)
        with open(output_tts_path, "wb") as f:
            async for chunk in communicate.stream():
                chunk_type = chunk["type"]
                if chunk_type == "audio":
                    f.write(chunk["data"])
                elif chunk_type == "WordBoundary":
                    # Word boundaries report the text just spoken; use them to
                    # estimate how far synthesis has progressed.
                    chunk_text = chunk["text"]
                    text_index = text.find(chunk_text, text_ptr)
                    if text_index == -1:
                        log.warning("Extraneous text received from edge tts: %s", chunk_text)
                        continue
                    text_ptr = text_index + len(chunk_text)
                    pct_complete = text_ptr / len(text)
                    log.debug("%.1f%%: %s", pct_complete * 100, chunk)
                    await update_progress(pct_complete / 2, "Synthesizing...")
                else:
                    log.warning("Unknown chunk type: %s: %s", chunk_type, json.dumps(chunk))
    output_rvc_path = cache_path(voice.tts, voice.name, rate, text, extension=format)
    if not os.path.exists(output_rvc_path):
        infer_pipeline = import_voice_converter()
        await infer_pipeline.convert_audio(
            pitch=voice.pitch,
            filter_radius=voice.filter_radius,
            index_rate=voice.index_rate,
            volume_envelope=voice.rms_mix_rate,
            protect=voice.protect,
            hop_length=voice.hop_length,
            f0_method=voice.f0_method,
            audio_input_path=str(output_tts_path),
            audio_output_path=str(output_rvc_path),
            model_path=voice.model,
            index_path=voice.index,
            split_audio=True,
            f0_autotune=voice.autotune is not None,
            f0_autotune_strength=voice.autotune,
            clean_audio=voice.clean is not None,
            clean_strength=voice.clean,
            export_format=format.upper(),
            upscale_audio=voice.upscale,
            f0_file=None,
            embedder_model=voice.embedder_model,
            embedder_model_custom=None,
            sid=0,
            formant_shifting=None,
            formant_qfrency=None,
            formant_timbre=None,
            post_process=None,
            reverb=None,
            pitch_shift=None,
            limiter=None,
            gain=None,
            distortion=None,
            chorus=None,
            bitcrush=None,
            clipping=None,
            compressor=None,
            delay=None,
            sliders=None,
            callback=lambda pct: update_progress(0.5 + pct / 2, "Converting..."),
        )
    log.info("Successfully synthesized text (%s chars)", len(text))
    return "Text synthesized successfully.", str(output_rvc_path)

# Prerequisites