import asyncio
import json
import logging
import os
from functools import lru_cache

import edge_tts
import gradio as gr

from tts_service.utils import cache_path
from tts_service.voices import voice_manager

log = logging.getLogger(__name__)


@lru_cache(maxsize=None)
def import_voice_converter():
    """Lazily import and construct the RVC VoiceConverter.

    The import is deferred (and the instance cached via lru_cache) so the
    heavy RVC machinery is only loaded on first use.
    """
    from rvc.infer.infer import VoiceConverter

    return VoiceConverter()


# TTS
async def run_tts_script(
    text: str,
    voice_name: str,
    rate: int = 0,
    progress=gr.Progress(),  # noqa: B008
) -> tuple[str, str]:
    """Synthesize *text* with edge-tts, then convert it with an RVC voice.

    Both stages are cached on disk via ``cache_path``: each stage is skipped
    when its output file already exists.

    Args:
        text: Text to synthesize (stripped before use).
        voice_name: Key into ``voice_manager.voices``.
        rate: Speech-rate adjustment in percent (may be negative).
        progress: Gradio progress reporter; first half of the bar tracks
            synthesis, second half tracks conversion.

    Returns:
        Tuple of (status message, path to the converted audio file).
    """

    async def update_progress(pct, msg) -> None:
        log.debug("Progress: %.1f%%: %s", pct * 100, msg)
        progress(pct, msg)
        # Yield control so the event loop can service other tasks between
        # progress updates.
        await asyncio.sleep(0)

    log.info("Synthesizing text (%s chars)", len(text))
    await update_progress(0, "Starting...")
    voice = voice_manager.voices[voice_name]
    # Renamed from `format` to avoid shadowing the builtin of the same name.
    extension = "wav"
    text = text.strip()
    output_tts_path = cache_path(voice.tts, "", rate, text, extension=extension)
    text_ptr = 0
    if not os.path.exists(output_tts_path):
        rates = f"+{rate}%" if rate >= 0 else f"{rate}%"
        communicate = edge_tts.Communicate(text, voice.tts, rate=rates)
        with open(output_tts_path, "wb") as f:
            async for chunk in communicate.stream():
                chunk_type = chunk["type"]
                if chunk_type == "audio":
                    f.write(chunk["data"])
                elif chunk_type == "WordBoundary":
                    chunk_text = chunk["text"]
                    # BUG FIX: str.index raises ValueError when the substring
                    # is absent and never returns -1, so the guard below was
                    # dead and extraneous text crashed the stream. str.find
                    # returns -1 as intended.
                    text_index = text.find(chunk_text, text_ptr)
                    if text_index == -1:
                        log.warning(
                            "Extraneous text received from edge tts: %s", chunk_text
                        )
                        continue
                    # Advance past the matched word to estimate synthesis
                    # progress as a fraction of the input text consumed.
                    text_ptr = text_index + len(chunk_text)
                    pct_complete = text_ptr / len(text)
                    log.debug("%.1f%%: %s", pct_complete * 100, chunk)
                    await update_progress(pct_complete / 2, "Synthesizing...")
                else:
                    log.warning(
                        "Unknown chunk type: %s: %s", chunk_type, json.dumps(chunk)
                    )
    output_rvc_path = cache_path(voice.tts, voice.name, rate, text, extension=extension)
    if not os.path.exists(output_rvc_path):
        infer_pipeline = import_voice_converter()
        await infer_pipeline.convert_audio(
            pitch=voice.pitch,
            filter_radius=voice.filter_radius,
            index_rate=voice.index_rate,
            volume_envelope=voice.rms_mix_rate,
            protect=voice.protect,
            hop_length=voice.hop_length,
            f0_method=voice.f0_method,
            audio_input_path=str(output_tts_path),
            audio_output_path=str(output_rvc_path),
            model_path=voice.model,
            index_path=voice.index,
            split_audio=True,
            f0_autotune=voice.autotune is not None,
            f0_autotune_strength=voice.autotune,
            clean_audio=voice.clean is not None,
            clean_strength=voice.clean,
            export_format=extension.upper(),
            upscale_audio=voice.upscale,
            f0_file=None,
            embedder_model=voice.embedder_model,
            embedder_model_custom=None,
            sid=0,
            formant_shifting=None,
            formant_qfrency=None,
            formant_timbre=None,
            post_process=None,
            reverb=None,
            pitch_shift=None,
            limiter=None,
            gain=None,
            distortion=None,
            chorus=None,
            bitcrush=None,
            clipping=None,
            compressor=None,
            delay=None,
            sliders=None,
            # Conversion occupies the second half of the progress bar.
            callback=lambda pct: update_progress(0.5 + pct / 2, "Converting..."),
        )
    log.info("Successfully synthesized text (%s chars)", len(text))
    return "Text synthesized successfully.", str(output_rvc_path)


# Prerequisites