Spaces:

jlopez00
/

tts-service

Runtime error

App Files Files Community

jlopez00 committed on Dec 4, 2024

Commit

571ece7

verified ·

1 Parent(s): 1c6f49f

Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

tabs/tts/tts.py +1 -1
tabs/workflow/workflow.py +11 -3
tts_service/tts.py +71 -16
tts_service/utils.py +7 -0
tts_service/voices.py +2 -19

tabs/tts/tts.py CHANGED Viewed

@@ -47,7 +47,7 @@ def tts_tab():
             label=i18n("Output Information"),
             info=i18n("The output information will be displayed here."),
         )
-        vc_output2 = gr.Audio(label=i18n("Export Audio"))
     convert_button.click(
         fn=run_tts_script,

             label=i18n("Output Information"),
             info=i18n("The output information will be displayed here."),
         )
+        vc_output2 = gr.Audio(label=i18n("Generated Audio"))
     convert_button.click(
         fn=run_tts_script,

tabs/workflow/workflow.py CHANGED Viewed

@@ -48,28 +48,36 @@ def workflow_tab():
                 label=i18n("Source"),
                 info=i18n("Enter the document ID or URL."),
             )
             fetch_button = gr.Button(i18n("Fetch"))
             text = gr.Textbox(
                 label=i18n("Text"),
-                interactive=True,
             )
             voice = gr.Dropdown(
                 label=i18n("Voice"),
                 choices=voice_manager.voices.keys(),
                 value=voice_manager.voice_names[0],
             )
             synthesize_button = gr.Button(i18n("Synthesize"))
-            status = gr.Textbox(visible=False)
-            audio = gr.Audio(label=i18n("Export Audio"))
         with gr.Column():
             markdown = gr.Markdown(
                 label=i18n("Document"),
             )
     fetch_button.click(

                 label=i18n("Source"),
                 info=i18n("Enter the document ID or URL."),
             )
             fetch_button = gr.Button(i18n("Fetch"))
             text = gr.Textbox(
                 label=i18n("Text"),
+                visible=False,
             )
             voice = gr.Dropdown(
                 label=i18n("Voice"),
                 choices=voice_manager.voices.keys(),
                 value=voice_manager.voice_names[0],
+                visible=len(voice_manager.voices) > 1,
             )
             synthesize_button = gr.Button(i18n("Synthesize"))
+            audio = gr.Audio(
+                label=i18n("Generated Audio"),
+            )
+            status = gr.Markdown(
+                label=i18n("Status"),
+                show_label=True,
+            )
         with gr.Column():
             markdown = gr.Markdown(
                 label=i18n("Document"),
+                show_label=True,
             )
     fetch_button.click(

tts_service/tts.py CHANGED Viewed

@@ -4,6 +4,7 @@ import logging
 import math
 import os
 import time
 from functools import lru_cache
 import edge_tts
@@ -11,7 +12,7 @@ import gradio as gr
 import httpx
 import soundfile as sf
-from tts_service.utils import cache_path, env_str
 from tts_service.voices import voice_manager
 log = logging.getLogger(__name__)
@@ -35,25 +36,33 @@ async def run_tts_script(
         log.debug("Progress: %.1f%%: %s", pct * 100, msg)
         progress(pct, msg)
-    log.info("Synthesizing text (%s chars)", len(text))
     update_progress(0, "Starting...")
     voice = voice_manager.voices[voice_name]
     text = text.strip()
     output_tts_path = cache_path(voice.tts, "", rate, text, extension="mp3")
     text_ptr = 0
-    if not os.path.exists(output_tts_path):
         rates = f"+{rate}%" if rate >= 0 else f"{rate}%"
         communicate = edge_tts.Communicate(text, voice.tts, rate=rates)
         with open(output_tts_path, "wb") as f:
             async for chunk in communicate.stream():
                 chunk_type = chunk["type"]
                 if chunk_type == "audio":
                     f.write(chunk["data"])
                 elif chunk_type == "WordBoundary":
                     chunk_text = chunk["text"]
-                    text_index = text.index(chunk_text, text_ptr)
                     if text_index == -1:
                         log.warning("Extraneous text received from edge tts: %s", chunk_text)
                         continue
@@ -63,29 +72,39 @@ async def run_tts_script(
                     update_progress(pct_complete / 2, "Synthesizing...")
                 else:
                     log.warning("Unknown chunk type: %s: %s", chunk_type, json.dumps(chunk))
     audio_duration = sf.info(output_tts_path).duration
-    expected_processing_time = audio_duration / 8 + 10  # 10x real-time on nvidia t4
-    log.info(f"Synthesized {audio_duration:,.0f}s, expected processing time: {expected_processing_time:,.0f}s")
     output_rvc_path = cache_path(voice.tts, voice.name, rate, text, extension="mp3")
-    if not os.path.exists(output_rvc_path):
-        ts0 = time.time()
         last_check = 0.0
-        timeout = httpx.Timeout(5, read=15.0)
         endpoint_url = env_str("RVC_ENDPOINT")
         async with httpx.AsyncClient(timeout=timeout) as client:
             response = await client.post(f"{endpoint_url}/v1/rvc", content=output_tts_path.read_bytes())
             response.raise_for_status()
             data = response.json()
             log.info("Submitted for conversion: %s", data)
             result_url = data["urls"]["result"]
             while True:
-                elapsed = time.time() - ts0
                 proportion = elapsed / expected_processing_time
                 pct_complete = 0.5 + math.tanh(proportion) / 2
                 update_progress(pct_complete, "Processing...")
-                if elapsed > 0.8 * expected_processing_time and elapsed - last_check > 10:
                     last_check = elapsed
                     response = await client.get(result_url)
                     content_type = response.headers.get("Content-Type")
@@ -101,7 +120,43 @@ async def run_tts_script(
     else:
         log.info("Already converted: %s", output_rvc_path)
-    return f"{audio_duration:,.0f}s of audio successfully synthesized.", str(output_rvc_path)
-# Prerequisites

 import math
 import os
 import time
+from contextlib import suppress
 from functools import lru_cache
 import edge_tts
 import httpx
 import soundfile as sf
+from tts_service.utils import cache_path, env_str, seconds_to_ms
 from tts_service.voices import voice_manager
 log = logging.getLogger(__name__)
         log.debug("Progress: %.1f%%: %s", pct * 100, msg)
         progress(pct, msg)
+    tts_start = -1.0
+    rvc_start = -1.0
+    ts0 = time.time()
     update_progress(0, "Starting...")
     voice = voice_manager.voices[voice_name]
     text = text.strip()
     output_tts_path = cache_path(voice.tts, "", rate, text, extension="mp3")
     text_ptr = 0
+    tts_cached = os.path.exists(output_tts_path)
+    if not tts_cached:
+        log.info("Synthesizing %s chars into %s", len(text), output_tts_path)
         rates = f"+{rate}%" if rate >= 0 else f"{rate}%"
         communicate = edge_tts.Communicate(text, voice.tts, rate=rates)
         with open(output_tts_path, "wb") as f:
             async for chunk in communicate.stream():
+                if tts_start < 0:
+                    tts_start = time.time()
                 chunk_type = chunk["type"]
                 if chunk_type == "audio":
                     f.write(chunk["data"])
                 elif chunk_type == "WordBoundary":
                     chunk_text = chunk["text"]
+                    text_index = -1
+                    with suppress(ValueError):
+                        text_index = text.index(chunk_text, text_ptr)
                     if text_index == -1:
                         log.warning("Extraneous text received from edge tts: %s", chunk_text)
                         continue
                     update_progress(pct_complete / 2, "Synthesizing...")
                 else:
                     log.warning("Unknown chunk type: %s: %s", chunk_type, json.dumps(chunk))
+    else:
+        log.info("TTS cached at %s", output_tts_path)
     audio_duration = sf.info(output_tts_path).duration
+    expected_processing_time = audio_duration / 10 + 20  # 10x real-time on nvidia t4
+    ts1 = time.time()
     output_rvc_path = cache_path(voice.tts, voice.name, rate, text, extension="mp3")
+    rvc_cached = os.path.exists(output_rvc_path)
+    if not rvc_cached:
+        log.info(
+            "Converting %s of audio into %s. Expected duration: %s",
+            seconds_to_ms(audio_duration),
+            output_rvc_path,
+            seconds_to_ms(expected_processing_time),
+        )
         last_check = 0.0
+        timeout = httpx.Timeout(5, read=60.0)
         endpoint_url = env_str("RVC_ENDPOINT")
         async with httpx.AsyncClient(timeout=timeout) as client:
             response = await client.post(f"{endpoint_url}/v1/rvc", content=output_tts_path.read_bytes())
+            rvc_start = time.time()
             response.raise_for_status()
             data = response.json()
             log.info("Submitted for conversion: %s", data)
             result_url = data["urls"]["result"]
             while True:
+                elapsed = time.time() - ts1
+                rvc_elapsed = time.time() - rvc_start
                 proportion = elapsed / expected_processing_time
                 pct_complete = 0.5 + math.tanh(proportion) / 2
                 update_progress(pct_complete, "Processing...")
+                if rvc_elapsed > 0.8 * expected_processing_time and elapsed - last_check > 10:
                     last_check = elapsed
                     response = await client.get(result_url)
                     content_type = response.headers.get("Content-Type")
     else:
         log.info("Already converted: %s", output_rvc_path)
+    def format_duration(duration: float) -> str:
+        return "Cached" if duration < 1 else seconds_to_ms(duration)
+    def format_wpm(duration: float) -> str:
+        return "Cached" if duration < 1 else f"{word_count * 60 / duration:,.0f}"
+    def format_rate(duration: float) -> str:
+        return "Cached" if duration < 1 else f"{audio_duration / duration:.1f}x"
+    def format_latency(latency: float) -> str:
+        return "N/A" if latency < 1 else f"{latency:.2f}s"
+    ts2 = time.time()
+    total_time = ts2 - ts0
+    rvc_time = ts2 - rvc_start if rvc_start > 0 else 0
+    tts_time = ts1 - tts_start if tts_start > 0 else 0
+    word_count = len(text.split())
+    durations = (audio_duration, total_time, tts_time, rvc_time)
+    times = " | ".join(format_duration(t) for t in durations)
+    wpms = " | ".join(format_wpm(t) for t in durations)
+    rates = " | ".join(format_rate(t) for t in durations)
+    latencies = " | ".join(format_latency(latency) for latency in (0, 0, tts_start - ts0, rvc_start - ts1))
+    rvc_cost = "N/A" if rvc_cached else f"{rvc_time * 0.0164:.1f}¢"
+    markdown_status = f"""
+Audio successfully synthesized.
+|     | Words | Chars | Cost |
+|-----|------:|------:|-----:|
+|Count|{word_count:,}|{len(text):,}|{rvc_cost}|
+|     |Actual|Processing|TTS|RVC|
+|-----|-----:|---------:|--:|--:|
+|Time|{times}|
+|WPM|{wpms}|
+|Rate|{rates}|
+|Latency|{latencies}|
+""".strip()
+    log.info(markdown_status)
+    return markdown_status, str(output_rvc_path)

tts_service/utils.py CHANGED Viewed

@@ -61,3 +61,10 @@ def markdown_to_text(md: str) -> str:
     doc = pandoc.read(md, format="markdown")
     txt = pandoc.write(doc, format="plain-smart", options=["--wrap=none"])
     return cast(str, txt)

     doc = pandoc.read(md, format="markdown")
     txt = pandoc.write(doc, format="plain-smart", options=["--wrap=none"])
     return cast(str, txt)
+def seconds_to_ms(seconds: float) -> str:
+    """Format a duration given in seconds as ``MM:SS.d``.
+
+    ``MM`` is whole minutes (zero-padded to two digits, not wrapped at 60),
+    ``SS`` is whole seconds within the minute and ``d`` is tenths of a second,
+    truncated rather than rounded. Intended for non-negative durations.
+    """
+    # Convert to whole tenths first: the fractional digit must come from the
+    # original float, before anything is truncated to whole seconds.
+    total_tenths = int(seconds * 10)
+    minutes, remainder = divmod(total_tenths, 600)
+    whole_seconds, ds = divmod(remainder, 10)
+    return f"{minutes:02}:{whole_seconds:02}.{ds:01}"

tts_service/voices.py CHANGED Viewed

@@ -29,20 +29,7 @@ class S3VoiceObj:
 class Voice(BaseModel):
     name: str
-    model: str
     tts: str
-    index: str = ""
-    autotune: float | None = None
-    clean: float | None = 0.5
-    upscale: bool = False
-    pitch: int = 0
-    filter_radius: int = 3
-    index_rate: float = 0.75
-    rms_mix_rate: float = 1
-    protect: float = 0.5
-    hop_length: int = 128
-    f0_method: str = "rmvpe"
-    embedder_model: str = "contentvec"
 class TTSVoice(BaseModel):
@@ -110,13 +97,9 @@ class VoiceManager:
         rv = {}
         for path in sorted(self.voices_dir.glob("*.json")):
             voice = Voice.model_validate_json(path.read_bytes())
-            model_path = self.voices_dir / f"{voice.model}"
-            if not model_path.exists():
-                logging.warning("Voice %s missing model %s", voice.name, voice.model)
-            elif voice.tts not in self.tts_voices:
-                logging.warning("Voice %s references invalid tts %s", voice.name, voice.model)
             else:
-                voice.model = str(model_path)
                 rv[voice.name] = voice
         return rv

 class Voice(BaseModel):
     name: str
     tts: str
 class TTSVoice(BaseModel):
         rv = {}
         for path in sorted(self.voices_dir.glob("*.json")):
             voice = Voice.model_validate_json(path.read_bytes())
+            if voice.tts not in self.tts_voices:
+                logging.warning("Voice %s references invalid tts %s", voice.name, voice.tts)
             else:
                 rv[voice.name] = voice
         return rv