File size: 4,287 Bytes
b3385db
 
 
1c6f49f
b3385db
1c6f49f
b3385db
 
 
 
1c6f49f
 
b3385db
1c6f49f
b3385db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c01ee6
b3385db
 
 
1c6f49f
b3385db
 
 
2c01ee6
 
1c6f49f
b3385db
 
 
1c6f49f
b3385db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c6f49f
b3385db
 
 
1c6f49f
 
 
 
 
b3385db
1c6f49f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b3385db
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import asyncio
import json
import logging
import math
import os
import time
from functools import lru_cache

import edge_tts
import gradio as gr
import httpx
import soundfile as sf

from tts_service.utils import cache_path, env_str
from tts_service.voices import voice_manager

log = logging.getLogger(__name__)


@lru_cache(maxsize=None)
def import_voice_converter():
    """Return a ``VoiceConverter`` instance, importing the RVC module lazily.

    The import is deferred to the first call, and the unbounded cache on a
    zero-argument function means every later call gets back the very same
    object instead of constructing a new one.
    """
    # Imported here rather than at module level so loading this module does
    # not pull in the RVC inference package.
    from rvc.infer.infer import VoiceConverter

    converter = VoiceConverter()
    return converter


# TTS
async def run_tts_script(
    text: str,
    voice_name: str,
    rate: int = 0,
    progress=gr.Progress(),  # noqa: B008
) -> tuple[str, str]:
    """Synthesize *text* with Edge TTS, then convert it via the remote RVC endpoint.

    Both stages are cached on disk under paths produced by ``cache_path``; a
    stage is skipped when its output file already exists.

    Progress is reported through *progress*: the first half of the bar tracks
    how much of the text Edge TTS has spoken, the second half is an estimate
    based on elapsed time versus the expected conversion time.

    Args:
        text: Text to speak. Leading/trailing whitespace is stripped.
        voice_name: Key into ``voice_manager.voices``.
        rate: Speaking-rate adjustment in percent, passed to Edge TTS as
            ``"+N%"`` / ``"-N%"``.
        progress: Gradio progress callback, called as ``progress(pct, msg)``.

    Returns:
        A ``(status_message, output_path)`` tuple where ``output_path`` is the
        string path of the converted audio file.

    Raises:
        KeyError: If *voice_name* is not in ``voice_manager.voices``.
        httpx.HTTPStatusError: If submitting the job fails, or polling returns
            an error status other than 404.
    """

    def update_progress(pct, msg) -> None:
        log.debug("Progress: %.1f%%: %s", pct * 100, msg)
        progress(pct, msg)

    log.info("Synthesizing text (%s chars)", len(text))

    update_progress(0, "Starting...")
    voice = voice_manager.voices[voice_name]

    text = text.strip()
    output_tts_path = cache_path(voice.tts, "", rate, text, extension="mp3")
    text_ptr = 0
    if not os.path.exists(output_tts_path):
        rates = f"+{rate}%" if rate >= 0 else f"{rate}%"
        communicate = edge_tts.Communicate(text, voice.tts, rate=rates)
        # Stream into a temporary sibling file and only move it into the cache
        # location once the stream finished. Otherwise a failure mid-stream
        # would leave a truncated file that later calls treat as a cache hit.
        partial_tts_path = f"{output_tts_path}.{time.monotonic_ns()}.part"
        try:
            with open(partial_tts_path, "wb") as f:
                async for chunk in communicate.stream():
                    chunk_type = chunk["type"]
                    if chunk_type == "audio":
                        f.write(chunk["data"])
                    elif chunk_type == "WordBoundary":
                        chunk_text = chunk["text"]
                        # str.find returns -1 when the word is absent;
                        # str.index would raise ValueError and abort synthesis.
                        text_index = text.find(chunk_text, text_ptr)
                        if text_index == -1:
                            log.warning("Extraneous text received from edge tts: %s", chunk_text)
                            continue
                        text_ptr = text_index + len(chunk_text)
                        pct_complete = text_ptr / len(text)
                        log.debug("%.1f%%: %s", pct_complete * 100, chunk)
                        # Synthesis occupies the first half of the progress bar.
                        update_progress(pct_complete / 2, "Synthesizing...")
                    else:
                        log.warning("Unknown chunk type: %s: %s", chunk_type, json.dumps(chunk))
            os.replace(partial_tts_path, output_tts_path)
        finally:
            # After a successful os.replace the partial file no longer exists,
            # so this only cleans up after a failure or cancellation.
            if os.path.exists(partial_tts_path):
                os.remove(partial_tts_path)

    audio_duration = sf.info(output_tts_path).duration
    # NOTE(review): the original comment says 10x while the divisor is 8 -
    # presumably deliberate headroom; confirm.
    expected_processing_time = audio_duration / 8 + 10  # 10x real-time on nvidia t4
    log.info(f"Synthesized {audio_duration:,.0f}s, expected processing time: {expected_processing_time:,.0f}s")

    output_rvc_path = cache_path(voice.tts, voice.name, rate, text, extension="mp3")
    if not os.path.exists(output_rvc_path):
        # Monotonic clock: elapsed time must not jump if the wall clock is adjusted.
        ts0 = time.monotonic()
        last_check = 0.0
        timeout = httpx.Timeout(5, read=15.0)
        endpoint_url = env_str("RVC_ENDPOINT")
        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.post(f"{endpoint_url}/v1/rvc", content=output_tts_path.read_bytes())
            response.raise_for_status()
            data = response.json()
            log.info("Submitted for conversion: %s", data)
            result_url = data["urls"]["result"]
            while True:
                elapsed = time.monotonic() - ts0
                proportion = elapsed / expected_processing_time
                # tanh maps elapsed time into [0, 1), so the second half of the
                # bar keeps creeping forward without ever reaching 100%.
                pct_complete = 0.5 + math.tanh(proportion) / 2
                update_progress(pct_complete, "Processing...")
                # Start polling once 80% of the expected time has passed, then
                # poll at most once every 10 seconds.
                if elapsed > 0.8 * expected_processing_time and elapsed - last_check > 10:
                    last_check = elapsed
                    response = await client.get(result_url)
                    content_type = response.headers.get("Content-Type")
                    processed_bytes = await response.aread()
                    log.info(
                        "Checking status: %s (%s) %s bytes",
                        response.status_code,
                        content_type,
                        f"{len(processed_bytes):,}",
                    )
                    if response.status_code == 200 and content_type == "audio/mpeg":
                        output_rvc_path.write_bytes(processed_bytes)
                        break
                    elif response.status_code != 404:
                        # 404 is treated as "result not ready yet"; other
                        # error statuses are raised to the caller.
                        response.raise_for_status()
                await asyncio.sleep(0.1)
        log.info("Successfully converted text (%s chars) -> %s", len(text), output_rvc_path)
    else:
        log.info("Already converted: %s", output_rvc_path)

    return f"{audio_duration:,.0f}s of audio successfully synthesized.", str(output_rvc_path)


# Prerequisites