Spaces:
Runtime error
Runtime error
Upload folder using huggingface_hub
Browse files- tabs/tts/tts.py +1 -1
- tabs/workflow/workflow.py +11 -3
- tts_service/tts.py +71 -16
- tts_service/utils.py +7 -0
- tts_service/voices.py +2 -19
tabs/tts/tts.py
CHANGED
@@ -47,7 +47,7 @@ def tts_tab():
|
|
47 |
label=i18n("Output Information"),
|
48 |
info=i18n("The output information will be displayed here."),
|
49 |
)
|
50 |
-
vc_output2 = gr.Audio(label=i18n("
|
51 |
|
52 |
convert_button.click(
|
53 |
fn=run_tts_script,
|
|
|
47 |
label=i18n("Output Information"),
|
48 |
info=i18n("The output information will be displayed here."),
|
49 |
)
|
50 |
+
vc_output2 = gr.Audio(label=i18n("Generated Audio"))
|
51 |
|
52 |
convert_button.click(
|
53 |
fn=run_tts_script,
|
tabs/workflow/workflow.py
CHANGED
@@ -48,28 +48,36 @@ def workflow_tab():
|
|
48 |
label=i18n("Source"),
|
49 |
info=i18n("Enter the document ID or URL."),
|
50 |
)
|
|
|
51 |
fetch_button = gr.Button(i18n("Fetch"))
|
52 |
|
53 |
text = gr.Textbox(
|
54 |
label=i18n("Text"),
|
55 |
-
|
56 |
)
|
57 |
|
58 |
voice = gr.Dropdown(
|
59 |
label=i18n("Voice"),
|
60 |
choices=voice_manager.voices.keys(),
|
61 |
value=voice_manager.voice_names[0],
|
|
|
62 |
)
|
63 |
|
64 |
synthesize_button = gr.Button(i18n("Synthesize"))
|
65 |
|
66 |
-
|
|
|
|
|
67 |
|
68 |
-
|
|
|
|
|
|
|
69 |
|
70 |
with gr.Column():
|
71 |
markdown = gr.Markdown(
|
72 |
label=i18n("Document"),
|
|
|
73 |
)
|
74 |
|
75 |
fetch_button.click(
|
|
|
48 |
label=i18n("Source"),
|
49 |
info=i18n("Enter the document ID or URL."),
|
50 |
)
|
51 |
+
|
52 |
fetch_button = gr.Button(i18n("Fetch"))
|
53 |
|
54 |
text = gr.Textbox(
|
55 |
label=i18n("Text"),
|
56 |
+
visible=False,
|
57 |
)
|
58 |
|
59 |
voice = gr.Dropdown(
|
60 |
label=i18n("Voice"),
|
61 |
choices=voice_manager.voices.keys(),
|
62 |
value=voice_manager.voice_names[0],
|
63 |
+
visible=len(voice_manager.voices) > 1,
|
64 |
)
|
65 |
|
66 |
synthesize_button = gr.Button(i18n("Synthesize"))
|
67 |
|
68 |
+
audio = gr.Audio(
|
69 |
+
label=i18n("Generated Audio"),
|
70 |
+
)
|
71 |
|
72 |
+
status = gr.Markdown(
|
73 |
+
label=i18n("Status"),
|
74 |
+
show_label=True,
|
75 |
+
)
|
76 |
|
77 |
with gr.Column():
|
78 |
markdown = gr.Markdown(
|
79 |
label=i18n("Document"),
|
80 |
+
show_label=True,
|
81 |
)
|
82 |
|
83 |
fetch_button.click(
|
tts_service/tts.py
CHANGED
@@ -4,6 +4,7 @@ import logging
|
|
4 |
import math
|
5 |
import os
|
6 |
import time
|
|
|
7 |
from functools import lru_cache
|
8 |
|
9 |
import edge_tts
|
@@ -11,7 +12,7 @@ import gradio as gr
|
|
11 |
import httpx
|
12 |
import soundfile as sf
|
13 |
|
14 |
-
from tts_service.utils import cache_path, env_str
|
15 |
from tts_service.voices import voice_manager
|
16 |
|
17 |
log = logging.getLogger(__name__)
|
@@ -35,25 +36,33 @@ async def run_tts_script(
|
|
35 |
log.debug("Progress: %.1f%%: %s", pct * 100, msg)
|
36 |
progress(pct, msg)
|
37 |
|
38 |
-
|
39 |
-
|
|
|
40 |
update_progress(0, "Starting...")
|
41 |
voice = voice_manager.voices[voice_name]
|
42 |
|
43 |
text = text.strip()
|
44 |
output_tts_path = cache_path(voice.tts, "", rate, text, extension="mp3")
|
45 |
text_ptr = 0
|
46 |
-
|
|
|
|
|
47 |
rates = f"+{rate}%" if rate >= 0 else f"{rate}%"
|
48 |
communicate = edge_tts.Communicate(text, voice.tts, rate=rates)
|
49 |
with open(output_tts_path, "wb") as f:
|
50 |
async for chunk in communicate.stream():
|
|
|
|
|
|
|
51 |
chunk_type = chunk["type"]
|
52 |
if chunk_type == "audio":
|
53 |
f.write(chunk["data"])
|
54 |
elif chunk_type == "WordBoundary":
|
55 |
chunk_text = chunk["text"]
|
56 |
-
text_index =
|
|
|
|
|
57 |
if text_index == -1:
|
58 |
log.warning("Extraneous text received from edge tts: %s", chunk_text)
|
59 |
continue
|
@@ -63,29 +72,39 @@ async def run_tts_script(
|
|
63 |
update_progress(pct_complete / 2, "Synthesizing...")
|
64 |
else:
|
65 |
log.warning("Unknown chunk type: %s: %s", chunk_type, json.dumps(chunk))
|
|
|
|
|
66 |
|
67 |
audio_duration = sf.info(output_tts_path).duration
|
68 |
-
expected_processing_time = audio_duration /
|
69 |
-
log.info(f"Synthesized {audio_duration:,.0f}s, expected processing time: {expected_processing_time:,.0f}s")
|
70 |
|
|
|
71 |
output_rvc_path = cache_path(voice.tts, voice.name, rate, text, extension="mp3")
|
72 |
-
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
last_check = 0.0
|
75 |
-
timeout = httpx.Timeout(5, read=
|
76 |
endpoint_url = env_str("RVC_ENDPOINT")
|
77 |
async with httpx.AsyncClient(timeout=timeout) as client:
|
78 |
response = await client.post(f"{endpoint_url}/v1/rvc", content=output_tts_path.read_bytes())
|
|
|
79 |
response.raise_for_status()
|
80 |
data = response.json()
|
81 |
log.info("Submitted for conversion: %s", data)
|
82 |
result_url = data["urls"]["result"]
|
83 |
while True:
|
84 |
-
elapsed = time.time() -
|
|
|
85 |
proportion = elapsed / expected_processing_time
|
86 |
pct_complete = 0.5 + math.tanh(proportion) / 2
|
87 |
update_progress(pct_complete, "Processing...")
|
88 |
-
if
|
89 |
last_check = elapsed
|
90 |
response = await client.get(result_url)
|
91 |
content_type = response.headers.get("Content-Type")
|
@@ -101,7 +120,43 @@ async def run_tts_script(
|
|
101 |
else:
|
102 |
log.info("Already converted: %s", output_rvc_path)
|
103 |
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
import math
|
5 |
import os
|
6 |
import time
|
7 |
+
from contextlib import suppress
|
8 |
from functools import lru_cache
|
9 |
|
10 |
import edge_tts
|
|
|
12 |
import httpx
|
13 |
import soundfile as sf
|
14 |
|
15 |
+
from tts_service.utils import cache_path, env_str, seconds_to_ms
|
16 |
from tts_service.voices import voice_manager
|
17 |
|
18 |
log = logging.getLogger(__name__)
|
|
|
36 |
log.debug("Progress: %.1f%%: %s", pct * 100, msg)
|
37 |
progress(pct, msg)
|
38 |
|
39 |
+
tts_start = -1.0
|
40 |
+
rvc_start = -1.0
|
41 |
+
ts0 = time.time()
|
42 |
update_progress(0, "Starting...")
|
43 |
voice = voice_manager.voices[voice_name]
|
44 |
|
45 |
text = text.strip()
|
46 |
output_tts_path = cache_path(voice.tts, "", rate, text, extension="mp3")
|
47 |
text_ptr = 0
|
48 |
+
tts_cached = os.path.exists(output_tts_path)
|
49 |
+
if not tts_cached:
|
50 |
+
log.info("Synthesizing %s chars into %s", len(text), output_tts_path)
|
51 |
rates = f"+{rate}%" if rate >= 0 else f"{rate}%"
|
52 |
communicate = edge_tts.Communicate(text, voice.tts, rate=rates)
|
53 |
with open(output_tts_path, "wb") as f:
|
54 |
async for chunk in communicate.stream():
|
55 |
+
if tts_start < 0:
|
56 |
+
tts_start = time.time()
|
57 |
+
|
58 |
chunk_type = chunk["type"]
|
59 |
if chunk_type == "audio":
|
60 |
f.write(chunk["data"])
|
61 |
elif chunk_type == "WordBoundary":
|
62 |
chunk_text = chunk["text"]
|
63 |
+
text_index = -1
|
64 |
+
with suppress(ValueError):
|
65 |
+
text_index = text.index(chunk_text, text_ptr)
|
66 |
if text_index == -1:
|
67 |
log.warning("Extraneous text received from edge tts: %s", chunk_text)
|
68 |
continue
|
|
|
72 |
update_progress(pct_complete / 2, "Synthesizing...")
|
73 |
else:
|
74 |
log.warning("Unknown chunk type: %s: %s", chunk_type, json.dumps(chunk))
|
75 |
+
else:
|
76 |
+
log.info("TTS cached at %s", output_tts_path)
|
77 |
|
78 |
audio_duration = sf.info(output_tts_path).duration
|
79 |
+
expected_processing_time = audio_duration / 10 + 20 # 10x real-time on nvidia t4
|
|
|
80 |
|
81 |
+
ts1 = time.time()
|
82 |
output_rvc_path = cache_path(voice.tts, voice.name, rate, text, extension="mp3")
|
83 |
+
rvc_cached = os.path.exists(output_rvc_path)
|
84 |
+
if not rvc_cached:
|
85 |
+
log.info(
|
86 |
+
"Converting %s of audio into %s. Expected duration: %s",
|
87 |
+
seconds_to_ms(audio_duration),
|
88 |
+
output_rvc_path,
|
89 |
+
seconds_to_ms(expected_processing_time),
|
90 |
+
)
|
91 |
last_check = 0.0
|
92 |
+
timeout = httpx.Timeout(5, read=60.0)
|
93 |
endpoint_url = env_str("RVC_ENDPOINT")
|
94 |
async with httpx.AsyncClient(timeout=timeout) as client:
|
95 |
response = await client.post(f"{endpoint_url}/v1/rvc", content=output_tts_path.read_bytes())
|
96 |
+
rvc_start = time.time()
|
97 |
response.raise_for_status()
|
98 |
data = response.json()
|
99 |
log.info("Submitted for conversion: %s", data)
|
100 |
result_url = data["urls"]["result"]
|
101 |
while True:
|
102 |
+
elapsed = time.time() - ts1
|
103 |
+
rvc_elapsed = time.time() - rvc_start
|
104 |
proportion = elapsed / expected_processing_time
|
105 |
pct_complete = 0.5 + math.tanh(proportion) / 2
|
106 |
update_progress(pct_complete, "Processing...")
|
107 |
+
if rvc_elapsed > 0.8 * expected_processing_time and elapsed - last_check > 10:
|
108 |
last_check = elapsed
|
109 |
response = await client.get(result_url)
|
110 |
content_type = response.headers.get("Content-Type")
|
|
|
120 |
else:
|
121 |
log.info("Already converted: %s", output_rvc_path)
|
122 |
|
123 |
+
def format_duration(duration: float) -> str:
|
124 |
+
return "Cached" if duration < 1 else seconds_to_ms(duration)
|
125 |
+
|
126 |
+
def format_wpm(duration: float) -> str:
|
127 |
+
return "Cached" if duration < 1 else f"{word_count * 60 / duration:,.0f}"
|
128 |
+
|
129 |
+
def format_rate(duration: float) -> str:
|
130 |
+
return "Cached" if duration < 1 else f"{audio_duration / duration:.1f}x"
|
131 |
+
|
132 |
+
def format_latency(latency: float) -> str:
|
133 |
+
return "N/A" if latency < 1 else f"{latency:.2f}s"
|
134 |
+
|
135 |
+
ts2 = time.time()
|
136 |
+
total_time = ts2 - ts0
|
137 |
+
rvc_time = ts2 - rvc_start if rvc_start > 0 else 0
|
138 |
+
tts_time = ts1 - tts_start if tts_start > 0 else 0
|
139 |
+
word_count = len(text.split())
|
140 |
+
durations = (audio_duration, total_time, tts_time, rvc_time)
|
141 |
+
times = " | ".join(format_duration(t) for t in durations)
|
142 |
+
wpms = " | ".join(format_wpm(t) for t in durations)
|
143 |
+
rates = " | ".join(format_rate(t) for t in durations)
|
144 |
+
latencies = " | ".join(format_latency(latency) for latency in (0, 0, tts_start - ts0, rvc_start - ts1))
|
145 |
+
rvc_cost = "N/A" if rvc_cached else f"{rvc_time * 0.0164:.1f}¢"
|
146 |
+
markdown_status = f"""
|
147 |
+
Audio successfully synthesized.
|
148 |
+
|
149 |
+
| | Words | Chars | Cost |
|
150 |
+
|-----|------:|------:|-----:|
|
151 |
+
|Count|{word_count:,}|{len(text):,}|{rvc_cost}|
|
152 |
+
|
153 |
+
| |Actual|Processing|TTS|RVC|
|
154 |
+
|-----|-----:|---------:|--:|--:|
|
155 |
+
|Time|{times}|
|
156 |
+
|WPM|{wpms}|
|
157 |
+
|Rate|{rates}|
|
158 |
+
|Latency|{latencies}|
|
159 |
+
|
160 |
+
""".strip()
|
161 |
+
log.info(markdown_status)
|
162 |
+
return markdown_status, str(output_rvc_path)
|
tts_service/utils.py
CHANGED
@@ -61,3 +61,10 @@ def markdown_to_text(md: str) -> str:
|
|
61 |
doc = pandoc.read(md, format="markdown")
|
62 |
txt = pandoc.write(doc, format="plain-smart", options=["--wrap=none"])
|
63 |
return cast(str, txt)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
doc = pandoc.read(md, format="markdown")
|
62 |
txt = pandoc.write(doc, format="plain-smart", options=["--wrap=none"])
|
63 |
return cast(str, txt)
|
64 |
+
|
65 |
+
|
66 |
+
def seconds_to_ms(seconds: float) -> str:
    """Format a duration given in seconds as ``MM:SS.d``.

    ``MM`` is whole minutes (zero-padded to two digits, not capped at 59),
    ``SS`` is the remaining whole seconds, and ``d`` is tenths of a second,
    truncated rather than rounded.

    Args:
        seconds: Duration in seconds. Intended for non-negative values.

    Returns:
        The formatted string, e.g. ``"01:15.5"`` for ``75.5``.
    """
    whole_seconds = int(seconds)
    minutes, secs = divmod(whole_seconds, 60)
    # Derive the tenths from the ORIGINAL float. The earlier version rebound
    # ``seconds`` to an int first, so the fractional part was always zero and
    # every result ended in ".0".
    tenths = int((seconds - whole_seconds) * 10)
    return f"{minutes:02}:{secs:02}.{tenths:01}"
|
tts_service/voices.py
CHANGED
@@ -29,20 +29,7 @@ class S3VoiceObj:
|
|
29 |
|
30 |
class Voice(BaseModel):
|
31 |
name: str
|
32 |
-
model: str
|
33 |
tts: str
|
34 |
-
index: str = ""
|
35 |
-
autotune: float | None = None
|
36 |
-
clean: float | None = 0.5
|
37 |
-
upscale: bool = False
|
38 |
-
pitch: int = 0
|
39 |
-
filter_radius: int = 3
|
40 |
-
index_rate: float = 0.75
|
41 |
-
rms_mix_rate: float = 1
|
42 |
-
protect: float = 0.5
|
43 |
-
hop_length: int = 128
|
44 |
-
f0_method: str = "rmvpe"
|
45 |
-
embedder_model: str = "contentvec"
|
46 |
|
47 |
|
48 |
class TTSVoice(BaseModel):
|
@@ -110,13 +97,9 @@ class VoiceManager:
|
|
110 |
rv = {}
|
111 |
for path in sorted(self.voices_dir.glob("*.json")):
|
112 |
voice = Voice.model_validate_json(path.read_bytes())
|
113 |
-
|
114 |
-
|
115 |
-
logging.warning("Voice %s missing model %s", voice.name, voice.model)
|
116 |
-
elif voice.tts not in self.tts_voices:
|
117 |
-
logging.warning("Voice %s references invalid tts %s", voice.name, voice.model)
|
118 |
else:
|
119 |
-
voice.model = str(model_path)
|
120 |
rv[voice.name] = voice
|
121 |
return rv
|
122 |
|
|
|
29 |
|
30 |
class Voice(BaseModel):
|
31 |
name: str
|
|
|
32 |
tts: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
|
35 |
class TTSVoice(BaseModel):
|
|
|
97 |
rv = {}
|
98 |
for path in sorted(self.voices_dir.glob("*.json")):
|
99 |
voice = Voice.model_validate_json(path.read_bytes())
|
100 |
+
if voice.tts not in self.tts_voices:
|
101 |
+
logging.warning("Voice %s references invalid tts %s", voice.name, voice.tts)
|
|
|
|
|
|
|
102 |
else:
|
|
|
103 |
rv[voice.name] = voice
|
104 |
return rv
|
105 |
|