Spaces:
Runtime error
Runtime error
File size: 6,282 Bytes
b3385db 1c6f49f b3385db 1c6f49f 571ece7 b3385db 1c6f49f b3385db 571ece7 b3385db 2c01ee6 b3385db 1c6f49f b3385db 571ece7 1c6f49f b3385db 1c6f49f b3385db 571ece7 b3385db 571ece7 b3385db 571ece7 b3385db 1c6f49f b3385db 571ece7 b3385db 1c6f49f 571ece7 1c6f49f 571ece7 1c6f49f 571ece7 1c6f49f 571ece7 1c6f49f 571ece7 1c6f49f 571ece7 1c6f49f 571ece7 1c6f49f 571ece7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
import asyncio
import json
import logging
import math
import os
import time
from contextlib import suppress
from functools import lru_cache
import edge_tts
import gradio as gr
import httpx
import soundfile as sf
from tts_service.utils import cache_path, env_str, seconds_to_ms
from tts_service.voices import voice_manager
log = logging.getLogger(__name__)
@lru_cache(maxsize=None)
def import_voice_converter():
    """Return the shared ``VoiceConverter`` instance, creating it on first call.

    The ``rvc`` import happens inside the function body, so it only runs when
    this function is first called. The function takes no arguments, so the
    unbounded ``lru_cache`` holds exactly one result and every later call
    returns that same object.
    """
    # Deferred import: nothing from ``rvc`` is loaded until a converter is requested.
    from rvc.infer.infer import VoiceConverter as converter_cls

    converter = converter_cls()
    return converter
# TTS
async def run_tts_script(
    text: str,
    voice_name: str,
    rate: int = 0,
    progress=gr.Progress(),  # noqa: B008
) -> tuple[str, str]:
    """Synthesize ``text`` with edge-tts, then convert the audio via the RVC endpoint.

    Both stages are cached on disk at the paths returned by ``cache_path``; a
    stage whose output file already exists is skipped. Cache files are written
    to a temporary sibling first and renamed into place only on success, so a
    failed or cancelled run never leaves a truncated file that later calls
    would mistake for a valid cache entry.

    Args:
        text: Text to speak. Leading and trailing whitespace is stripped.
        voice_name: Key into ``voice_manager.voices``.
        rate: Speaking-rate adjustment in percent, sent to edge-tts as
            ``+N%`` or ``-N%``.
        progress: Callable invoked as ``progress(fraction, message)``. The
            first half of the range tracks synthesis, the second half tracks
            conversion.

    Returns:
        A ``(markdown_status, output_path)`` tuple: a Markdown report with
        counts and timings, and the converted audio path as a string.

    Raises:
        KeyError: If ``voice_name`` is not in ``voice_manager.voices``.
        httpx.HTTPStatusError: If the RVC endpoint rejects the submission, or
            answers a status poll with an error status other than 404.
    """

    def update_progress(pct, msg) -> None:
        log.debug("Progress: %.1f%%: %s", pct * 100, msg)
        progress(pct, msg)

    def partial_path(final_path) -> str:
        # Unique sibling of the cache entry, so concurrent requests for the
        # same text never write into each other's temporary file.
        return f"{final_path}.{os.getpid()}.{time.time_ns()}.part"

    def discard(path: str) -> None:
        # After a successful os.replace the temporary file is already gone.
        with suppress(FileNotFoundError):
            os.remove(path)

    # Negative values mean "this stage did not run" (its output was cached).
    tts_start = -1.0
    rvc_start = -1.0
    ts0 = time.time()
    update_progress(0, "Starting...")
    voice = voice_manager.voices[voice_name]
    text = text.strip()
    output_tts_path = cache_path(voice.tts, "", rate, text, extension="mp3")
    text_ptr = 0
    tts_cached = os.path.exists(output_tts_path)
    if not tts_cached:
        log.info("Synthesizing %s chars into %s", len(text), output_tts_path)
        rate_spec = f"+{rate}%" if rate >= 0 else f"{rate}%"
        communicate = edge_tts.Communicate(text, voice.tts, rate=rate_spec)
        tmp_tts_path = partial_path(output_tts_path)
        try:
            with open(tmp_tts_path, "wb") as f:
                async for chunk in communicate.stream():
                    if tts_start < 0:
                        # Time of the first chunk, used for the latency report.
                        tts_start = time.time()
                    chunk_type = chunk["type"]
                    if chunk_type == "audio":
                        f.write(chunk["data"])
                    elif chunk_type == "WordBoundary":
                        chunk_text = chunk["text"]
                        text_index = -1
                        with suppress(ValueError):
                            # Search forward from the last match so repeated
                            # words advance the pointer instead of rewinding it.
                            text_index = text.index(chunk_text, text_ptr)
                        if text_index == -1:
                            log.warning("Extraneous text received from edge tts: %s", chunk_text)
                            continue
                        text_ptr = text_index + len(chunk_text)
                        pct_complete = text_ptr / len(text)
                        log.debug("%.1f%%: %s", pct_complete * 100, chunk)
                        update_progress(pct_complete / 2, "Synthesizing...")
                    else:
                        log.warning("Unknown chunk type: %s: %s", chunk_type, json.dumps(chunk))
            # Publish the cache entry only once the whole stream was written.
            os.replace(tmp_tts_path, output_tts_path)
        finally:
            discard(tmp_tts_path)
    else:
        log.info("TTS cached at %s", output_tts_path)
    audio_duration = sf.info(output_tts_path).duration
    expected_processing_time = audio_duration / 10 + 20  # 10x real-time on nvidia t4
    ts1 = time.time()
    output_rvc_path = cache_path(voice.tts, voice.name, rate, text, extension="mp3")
    rvc_cached = os.path.exists(output_rvc_path)
    if not rvc_cached:
        log.info(
            "Converting %s of audio into %s. Expected duration: %s",
            seconds_to_ms(audio_duration),
            output_rvc_path,
            seconds_to_ms(expected_processing_time),
        )
        last_check = 0.0
        timeout = httpx.Timeout(5, read=60.0)
        endpoint_url = env_str("RVC_ENDPOINT")
        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.post(f"{endpoint_url}/v1/rvc", content=output_tts_path.read_bytes())
            rvc_start = time.time()
            response.raise_for_status()
            data = response.json()
            log.info("Submitted for conversion: %s", data)
            result_url = data["urls"]["result"]
            # NOTE(review): there is no overall deadline here; if the result
            # never appears (404 forever) this loop keeps polling.
            while True:
                elapsed = time.time() - ts1
                rvc_elapsed = time.time() - rvc_start
                proportion = elapsed / expected_processing_time
                # tanh approaches 1 without reaching it, so the bar keeps
                # moving even when conversion overruns the estimate.
                pct_complete = 0.5 + math.tanh(proportion) / 2
                update_progress(pct_complete, "Processing...")
                # Start polling at 80% of the expected time, then at most
                # once every 10 seconds.
                if rvc_elapsed > 0.8 * expected_processing_time and elapsed - last_check > 10:
                    last_check = elapsed
                    response = await client.get(result_url)
                    content_type = response.headers.get("Content-Type")
                    processed_bytes = await response.aread()
                    log.info(
                        "Checking status: %s (%s) %s bytes",
                        response.status_code,
                        content_type,
                        f"{len(processed_bytes):,}",
                    )
                    if response.status_code == 200 and content_type == "audio/mpeg":
                        tmp_rvc_path = partial_path(output_rvc_path)
                        try:
                            with open(tmp_rvc_path, "wb") as f:
                                f.write(processed_bytes)
                            os.replace(tmp_rvc_path, output_rvc_path)
                        finally:
                            discard(tmp_rvc_path)
                        break
                    elif response.status_code != 404:
                        # 404 is tolerated and polled again; other error statuses abort.
                        response.raise_for_status()
                await asyncio.sleep(0.1)
        log.info("Successfully converted text (%s chars) -> %s", len(text), output_rvc_path)
    else:
        log.info("Already converted: %s", output_rvc_path)

    def format_duration(duration: float) -> str:
        return "Cached" if duration < 1 else seconds_to_ms(duration)

    def format_wpm(duration: float) -> str:
        return "Cached" if duration < 1 else f"{word_count * 60 / duration:,.0f}"

    def format_rate(duration: float) -> str:
        return "Cached" if duration < 1 else f"{audio_duration / duration:.1f}x"

    def format_latency(latency: float) -> str:
        # Only the negative "not measured" sentinel is hidden; real
        # sub-second latencies are reported.
        return "N/A" if latency < 0 else f"{latency:.2f}s"

    ts2 = time.time()
    total_time = ts2 - ts0
    rvc_time = ts2 - rvc_start if rvc_start > 0 else 0
    tts_time = ts1 - tts_start if tts_start > 0 else 0
    word_count = len(text.split())
    durations = (audio_duration, total_time, tts_time, rvc_time)
    times = " | ".join(format_duration(t) for t in durations)
    wpms = " | ".join(format_wpm(t) for t in durations)
    rates = " | ".join(format_rate(t) for t in durations)
    # Latency is only defined for the TTS and RVC columns, and only when the
    # corresponding stage actually ran.
    tts_latency = tts_start - ts0 if tts_start > 0 else -1.0
    rvc_latency = rvc_start - ts1 if rvc_start > 0 else -1.0
    latencies = " | ".join(format_latency(latency) for latency in (-1.0, -1.0, tts_latency, rvc_latency))
    # NOTE(review): 0.0164 is presumably the conversion cost in cents per second - confirm.
    rvc_cost = "N/A" if rvc_cached else f"{rvc_time * 0.0164:.1f}¢"
    # The Markdown stays at column 0 (indented lines would render as a code
    # block) and the two tables are separated by a blank line so the second
    # header is not parsed as rows of the first.
    markdown_status = f"""
Audio successfully synthesized.

| | Words | Chars | Cost |
|-----|------:|------:|-----:|
|Count|{word_count:,}|{len(text):,}|{rvc_cost}|

| |Actual|Processing|TTS|RVC|
|-----|-----:|---------:|--:|--:|
|Time|{times}|
|WPM|{wpms}|
|Rate|{rates}|
|Latency|{latencies}|
""".strip()
    log.info(markdown_status)
    return markdown_status, str(output_rvc_path)
|