File size: 6,282 Bytes
b3385db
 
 
1c6f49f
b3385db
1c6f49f
571ece7
b3385db
 
 
 
1c6f49f
 
b3385db
571ece7
b3385db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c01ee6
b3385db
 
 
1c6f49f
b3385db
 
 
571ece7
 
 
1c6f49f
b3385db
 
 
1c6f49f
b3385db
571ece7
 
 
b3385db
 
 
 
571ece7
 
 
b3385db
 
 
 
 
571ece7
 
 
b3385db
 
 
 
 
 
1c6f49f
b3385db
 
571ece7
 
b3385db
1c6f49f
571ece7
1c6f49f
571ece7
1c6f49f
571ece7
 
 
 
 
 
 
 
1c6f49f
571ece7
1c6f49f
 
 
571ece7
1c6f49f
 
 
 
 
571ece7
 
1c6f49f
 
 
571ece7
1c6f49f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571ece7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import asyncio
import json
import logging
import math
import os
import time
from contextlib import suppress
from functools import lru_cache

import edge_tts
import gradio as gr
import httpx
import soundfile as sf

from tts_service.utils import cache_path, env_str, seconds_to_ms
from tts_service.voices import voice_manager

log = logging.getLogger(__name__)


@lru_cache(maxsize=None)
def import_voice_converter():
    """Create the RVC ``VoiceConverter`` on first call and reuse it afterwards.

    The ``rvc`` import lives inside the function body, so it is only executed
    when this function is first called. Because the function takes no
    arguments, ``lru_cache`` hands back the same instance on every later call.
    """
    from rvc.infer.infer import VoiceConverter

    converter = VoiceConverter()
    return converter


# TTS
async def run_tts_script(
    text: str,
    voice_name: str,
    rate: int = 0,
    progress=gr.Progress(),  # noqa: B008
) -> tuple[str, str]:
    """Synthesize ``text`` with edge-tts, convert it via the RVC endpoint, and report stats.

    The work happens in two stages, each backed by a file whose path comes from
    ``cache_path``; a stage is skipped when its file already exists:

    1. Text-to-speech: the stripped text is streamed through
       ``edge_tts.Communicate`` into an mp3 file (first half of the progress bar).
    2. Voice conversion: the mp3 bytes are POSTed to ``{RVC_ENDPOINT}/v1/rvc``
       and the returned result URL is polled until it serves ``audio/mpeg``
       (second half of the progress bar).

    Args:
        text: Text to synthesize; leading/trailing whitespace is stripped.
        voice_name: Key into ``voice_manager.voices``.
        rate: Speaking-rate adjustment in percent, passed to edge-tts as
            ``"+N%"`` / ``"-N%"``.
        progress: Callable invoked as ``progress(fraction, message)``.

    Returns:
        A ``(markdown_status, output_path)`` tuple: a markdown summary table
        and the converted audio file path as a string.

    Raises:
        httpx.HTTPStatusError: If the submit request fails, or a poll returns
            an error status other than 404.
        Exception: Anything raised by the voice lookup, edge-tts, or file I/O
            propagates unchanged.
    """

    def update_progress(pct, msg) -> None:
        log.debug("Progress: %.1f%%: %s", pct * 100, msg)
        progress(pct, msg)

    # -1 marks "stage did not run" (i.e. its output was already cached).
    tts_start = -1.0
    rvc_start = -1.0
    ts0 = time.time()
    update_progress(0, "Starting...")
    voice = voice_manager.voices[voice_name]

    text = text.strip()
    output_tts_path = cache_path(voice.tts, "", rate, text, extension="mp3")
    text_ptr = 0
    tts_cached = os.path.exists(output_tts_path)
    if not tts_cached:
        log.info("Synthesizing %s chars into %s", len(text), output_tts_path)
        rate_spec = f"+{rate}%" if rate >= 0 else f"{rate}%"
        communicate = edge_tts.Communicate(text, voice.tts, rate=rate_spec)
        # Stream into a sibling ".part" file and move it into place only once
        # the stream finished. Otherwise a failed or cancelled synthesis would
        # leave a truncated mp3 at the cache path and every later call would
        # treat it as a valid cache hit.
        partial_tts_path = f"{output_tts_path}.part"
        try:
            with open(partial_tts_path, "wb") as f:
                async for chunk in communicate.stream():
                    if tts_start < 0:
                        tts_start = time.time()

                    chunk_type = chunk["type"]
                    if chunk_type == "audio":
                        f.write(chunk["data"])
                    elif chunk_type == "WordBoundary":
                        chunk_text = chunk["text"]
                        # Search forward from the last match so repeated words
                        # advance the pointer instead of re-matching earlier text.
                        text_index = text.find(chunk_text, text_ptr)
                        if text_index == -1:
                            log.warning("Extraneous text received from edge tts: %s", chunk_text)
                            continue
                        text_ptr = text_index + len(chunk_text)
                        pct_complete = text_ptr / len(text)
                        log.debug("%.1f%%: %s", pct_complete * 100, chunk)
                        # TTS owns the first half of the progress bar.
                        update_progress(pct_complete / 2, "Synthesizing...")
                    else:
                        log.warning("Unknown chunk type: %s: %s", chunk_type, json.dumps(chunk))
            os.replace(partial_tts_path, output_tts_path)
        finally:
            # After a successful replace the partial file no longer exists;
            # on failure this removes the incomplete download (best effort).
            with suppress(OSError):
                os.remove(partial_tts_path)
    else:
        log.info("TTS cached at %s", output_tts_path)

    audio_duration = sf.info(output_tts_path).duration
    expected_processing_time = audio_duration / 10 + 20  # 10x real-time on nvidia t4

    ts1 = time.time()
    output_rvc_path = cache_path(voice.tts, voice.name, rate, text, extension="mp3")
    rvc_cached = os.path.exists(output_rvc_path)
    if not rvc_cached:
        log.info(
            "Converting %s of audio into %s. Expected duration: %s",
            seconds_to_ms(audio_duration),
            output_rvc_path,
            seconds_to_ms(expected_processing_time),
        )
        last_check = 0.0
        timeout = httpx.Timeout(5, read=60.0)
        endpoint_url = env_str("RVC_ENDPOINT")
        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.post(f"{endpoint_url}/v1/rvc", content=output_tts_path.read_bytes())
            rvc_start = time.time()
            response.raise_for_status()
            data = response.json()
            log.info("Submitted for conversion: %s", data)
            result_url = data["urls"]["result"]
            # NOTE(review): this loop has no upper bound; if the result URL
            # never serves audio/mpeg (e.g. 404 forever) it polls indefinitely.
            while True:
                elapsed = time.time() - ts1
                rvc_elapsed = time.time() - rvc_start
                proportion = elapsed / expected_processing_time
                # tanh keeps the bar moving but only approaches 100% asymptotically,
                # since the real completion time is just an estimate.
                pct_complete = 0.5 + math.tanh(proportion) / 2
                update_progress(pct_complete, "Processing...")
                # Start polling once 80% of the expected time has passed,
                # then at most once every 10 seconds.
                if rvc_elapsed > 0.8 * expected_processing_time and elapsed - last_check > 10:
                    last_check = elapsed
                    response = await client.get(result_url)
                    content_type = response.headers.get("Content-Type")
                    processed_bytes = await response.aread()
                    log.info(
                        "Checking status: %s (%s) %s bytes",
                        response.status_code,
                        content_type,
                        f"{len(processed_bytes):,}",
                    )
                    if response.status_code == 200 and content_type == "audio/mpeg":
                        output_rvc_path.write_bytes(processed_bytes)
                        break
                    elif response.status_code != 404:
                        # 404 is treated as "not ready yet"; other error statuses abort.
                        response.raise_for_status()
                await asyncio.sleep(0.1)
        log.info("Successfully converted text (%s chars) -> %s", len(text), output_rvc_path)
    else:
        log.info("Already converted: %s", output_rvc_path)

    word_count = len(text.split())

    def format_duration(duration: float) -> str:
        return "Cached" if duration < 1 else seconds_to_ms(duration)

    def format_wpm(duration: float) -> str:
        return "Cached" if duration < 1 else f"{word_count * 60 / duration:,.0f}"

    def format_rate(duration: float) -> str:
        return "Cached" if duration < 1 else f"{audio_duration / duration:.1f}x"

    def format_latency(latency: float) -> str:
        # Non-positive values mean "not measured": the placeholder 0 columns,
        # or a start time still at its -1 sentinel. Real sub-second latencies
        # are shown rather than being reported as N/A.
        return "N/A" if latency <= 0 else f"{latency:.2f}s"

    ts2 = time.time()
    total_time = ts2 - ts0
    rvc_time = ts2 - rvc_start if rvc_start > 0 else 0
    tts_time = ts1 - tts_start if tts_start > 0 else 0
    durations = (audio_duration, total_time, tts_time, rvc_time)
    times = " | ".join(format_duration(t) for t in durations)
    wpms = " | ".join(format_wpm(t) for t in durations)
    rates = " | ".join(format_rate(t) for t in durations)
    latencies = " | ".join(format_latency(latency) for latency in (0, 0, tts_start - ts0, rvc_start - ts1))
    rvc_cost = "N/A" if rvc_cached else f"{rvc_time * 0.0164:.1f}¢"
    markdown_status = f"""
Audio successfully synthesized.

|     | Words | Chars | Cost |
|-----|------:|------:|-----:|
|Count|{word_count:,}|{len(text):,}|{rvc_cost}|

|     |Actual|Processing|TTS|RVC|
|-----|-----:|---------:|--:|--:|
|Time|{times}|
|WPM|{wpms}|
|Rate|{rates}|
|Latency|{latencies}|

""".strip()
    log.info(markdown_status)
    return markdown_status, str(output_rvc_path)