Spaces:

jlopez00
/

tts-service

Runtime error

App Files Files Community

tts-service / tts_service /tts.py

jlopez00

Upload folder using huggingface_hub

571ece7 verified 18 days ago

raw

history blame

6.28 kB

	import asyncio
	import json
	import logging
	import math
	import os
	import time
	from contextlib import suppress
	from functools import lru_cache

	import edge_tts
	import gradio as gr
	import httpx
	import soundfile as sf

	from tts_service.utils import cache_path, env_str, seconds_to_ms
	from tts_service.voices import voice_manager

	log = logging.getLogger(__name__)


	@lru_cache(maxsize=None)
	def import_voice_converter():
	    """Build the RVC ``VoiceConverter`` on first use and hand back the same one afterwards.

	    The function takes no arguments, so the unbounded ``lru_cache`` holds
	    exactly one entry: the converter created by the first call.
	    """
	    # The import lives inside the function so it only runs when a converter
	    # is actually requested (presumably because loading rvc is expensive --
	    # TODO confirm).
	    from rvc.infer.infer import VoiceConverter

	    converter = VoiceConverter()
	    return converter


	# TTS
	async def run_tts_script(
	    text: str,
	    voice_name: str,
	    rate: int = 0,
	    progress=gr.Progress(),  # noqa: B008
	) -> tuple[str, str]:
	    """Synthesize *text* with edge-tts, convert it via the RVC endpoint, and report timings.

	    Both stages are cached on disk under paths produced by ``cache_path``; a
	    stage whose output file already exists is skipped. Output files are staged
	    under a temporary name and renamed into place only after a complete write,
	    so an interrupted run never leaves a truncated file that later calls would
	    mistake for a cache hit.

	    Args:
	        text: Text to speak. Surrounding whitespace is stripped first.
	        voice_name: Key into ``voice_manager.voices``.
	        rate: Speaking-rate adjustment in percent, passed to edge-tts as
	            ``+N%`` / ``-N%``.
	        progress: Gradio progress callback, invoked as ``progress(fraction, message)``.

	    Returns:
	        A ``(markdown_status, output_path)`` tuple: a Markdown summary of
	        counts and timings, and the converted audio file path as a string.

	    Raises:
	        KeyError: If ``voice_name`` is not present in ``voice_manager.voices``.
	        httpx.HTTPStatusError: If the RVC endpoint replies with a non-success
	            status (a 404 while polling only means "not ready yet").
	    """

	    def update_progress(pct, msg) -> None:
	        log.debug("Progress: %.1f%%: %s", pct * 100, msg)
	        progress(pct, msg)

	    # None means the stage never started in this call (served from cache).
	    tts_start = None
	    rvc_start = None
	    ts0 = time.time()
	    update_progress(0, "Starting...")
	    voice = voice_manager.voices[voice_name]

	    text = text.strip()
	    output_tts_path = cache_path(voice.tts, "", rate, text, extension="mp3")
	    text_ptr = 0
	    tts_cached = os.path.exists(output_tts_path)
	    if not tts_cached:
	        log.info("Synthesizing %s chars into %s", len(text), output_tts_path)
	        rate_spec = f"+{rate}%" if rate >= 0 else f"{rate}%"
	        communicate = edge_tts.Communicate(text, voice.tts, rate=rate_spec)
	        staged_tts_path = _scratch_path(output_tts_path)
	        try:
	            with open(staged_tts_path, "wb") as f:
	                async for chunk in communicate.stream():
	                    if tts_start is None:
	                        tts_start = time.time()

	                    chunk_type = chunk["type"]
	                    if chunk_type == "audio":
	                        f.write(chunk["data"])
	                    elif chunk_type == "WordBoundary":
	                        # Locate the spoken word in the input, searching forward
	                        # from the last match, to estimate progress.
	                        chunk_text = chunk["text"]
	                        text_index = -1
	                        with suppress(ValueError):
	                            text_index = text.index(chunk_text, text_ptr)
	                        if text_index == -1:
	                            log.warning("Extraneous text received from edge tts: %s", chunk_text)
	                            continue
	                        text_ptr = text_index + len(chunk_text)
	                        pct_complete = text_ptr / len(text)
	                        log.debug("%.1f%%: %s", pct_complete * 100, chunk)
	                        # Synthesis fills the first half of the progress bar.
	                        update_progress(pct_complete / 2, "Synthesizing...")
	                    else:
	                        log.warning("Unknown chunk type: %s: %s", chunk_type, json.dumps(chunk))
	            # Publish into the cache only once the whole stream was written.
	            os.replace(staged_tts_path, output_tts_path)
	        finally:
	            # After a successful rename the staged file is gone; after a
	            # failure or cancellation this removes the partial download.
	            with suppress(FileNotFoundError):
	                os.remove(staged_tts_path)
	    else:
	        log.info("TTS cached at %s", output_tts_path)

	    audio_duration = sf.info(output_tts_path).duration
	    expected_processing_time = audio_duration / 10 + 20  # 10x real-time on nvidia t4

	    ts1 = time.time()
	    output_rvc_path = cache_path(voice.tts, voice.name, rate, text, extension="mp3")
	    rvc_cached = os.path.exists(output_rvc_path)
	    if not rvc_cached:
	        log.info(
	            "Converting %s of audio into %s. Expected duration: %s",
	            seconds_to_ms(audio_duration),
	            output_rvc_path,
	            seconds_to_ms(expected_processing_time),
	        )
	        last_check = 0.0
	        timeout = httpx.Timeout(5, read=60.0)
	        endpoint_url = env_str("RVC_ENDPOINT")
	        async with httpx.AsyncClient(timeout=timeout) as client:
	            response = await client.post(f"{endpoint_url}/v1/rvc", content=output_tts_path.read_bytes())
	            rvc_start = time.time()
	            response.raise_for_status()
	            data = response.json()
	            log.info("Submitted for conversion: %s", data)
	            result_url = data["urls"]["result"]
	            # NOTE(review): this loop has no overall deadline; if the result
	            # URL keeps answering 404 it polls forever. Consider a cap.
	            while True:
	                elapsed = time.time() - ts1
	                rvc_elapsed = time.time() - rvc_start
	                proportion = elapsed / expected_processing_time
	                # tanh approaches but never reaches 1, so the bar keeps
	                # creeping through the second half without completing early.
	                pct_complete = 0.5 + math.tanh(proportion) / 2
	                update_progress(pct_complete, "Processing...")
	                # Start polling near the expected finish, then at most once
	                # every 10 seconds.
	                if rvc_elapsed > 0.8 * expected_processing_time and elapsed - last_check > 10:
	                    last_check = elapsed
	                    response = await client.get(result_url)
	                    content_type = response.headers.get("Content-Type")
	                    processed_bytes = await response.aread()
	                    log.info(
	                        "Checking status: %s (%s) %s bytes",
	                        response.status_code,
	                        content_type,
	                        f"{len(processed_bytes):,}",
	                    )
	                    if response.status_code == 200 and content_type == "audio/mpeg":
	                        staged_rvc_path = _scratch_path(output_rvc_path)
	                        try:
	                            with open(staged_rvc_path, "wb") as f:
	                                f.write(processed_bytes)
	                            os.replace(staged_rvc_path, output_rvc_path)
	                        finally:
	                            with suppress(FileNotFoundError):
	                                os.remove(staged_rvc_path)
	                        break
	                    elif response.status_code != 404:
	                        response.raise_for_status()
	                await asyncio.sleep(0.1)
	        log.info("Successfully converted text (%s chars) -> %s", len(text), output_rvc_path)
	    else:
	        log.info("Already converted: %s", output_rvc_path)

	    ts2 = time.time()
	    total_time = ts2 - ts0
	    rvc_time = ts2 - rvc_start if rvc_start is not None else 0
	    tts_time = ts1 - tts_start if tts_start is not None else 0
	    # Latency is the wait before a stage produced its first response; it is
	    # undefined (None) for a stage that was served from cache.
	    tts_latency = tts_start - ts0 if tts_start is not None else None
	    rvc_latency = rvc_start - ts1 if rvc_start is not None else None
	    word_count = len(text.split())

	    def format_duration(duration: float) -> str:
	        return "Cached" if duration < 1 else seconds_to_ms(duration)

	    def format_wpm(duration: float) -> str:
	        return "Cached" if duration < 1 else f"{word_count * 60 / duration:,.0f}"

	    def format_rate(duration: float) -> str:
	        return "Cached" if duration < 1 else f"{audio_duration / duration:.1f}x"

	    def format_latency(latency) -> str:
	        return "N/A" if latency is None else f"{latency:.2f}s"

	    durations = (audio_duration, total_time, tts_time, rvc_time)
	    times = " | ".join(format_duration(t) for t in durations)
	    wpms = " | ".join(format_wpm(t) for t in durations)
	    rates = " | ".join(format_rate(t) for t in durations)
	    latencies = " | ".join(format_latency(latency) for latency in (None, None, tts_latency, rvc_latency))
	    rvc_cost = "N/A" if rvc_cached else f"{rvc_time * 0.0164:.1f}¢"
	    # Assembled line by line so every table row starts at column 0 no matter
	    # how this function is indented in the source file.
	    markdown_status = "\n".join(
	        [
	            "Audio successfully synthesized.",
	            "",
	            "| | Words | Chars | Cost |",
	            "|-----|------:|------:|-----:|",
	            f"|Count|{word_count:,}|{len(text):,}|{rvc_cost}|",
	            "",
	            "| |Actual|Processing|TTS|RVC|",
	            "|-----|-----:|---------:|--:|--:|",
	            f"|Time|{times}|",
	            f"|WPM|{wpms}|",
	            f"|Rate|{rates}|",
	            f"|Latency|{latencies}|",
	        ]
	    )
	    log.info(markdown_status)
	    return markdown_status, str(output_rvc_path)


	def _scratch_path(final_path) -> str:
	    """Return a unique sibling path for staging a write before renaming it onto *final_path*."""
	    # The pid plus a monotonic timestamp keeps concurrent requests for the
	    # same cache entry from writing into each other's staging file.
	    return f"{final_path}.{os.getpid()}.{time.monotonic_ns()}.part"