jlopez00 committed
Commit 571ece7
1 Parent(s): 1c6f49f

Upload folder using huggingface_hub

tabs/tts/tts.py CHANGED
@@ -47,7 +47,7 @@ def tts_tab():
             label=i18n("Output Information"),
             info=i18n("The output information will be displayed here."),
         )
-        vc_output2 = gr.Audio(label=i18n("Export Audio"))
+        vc_output2 = gr.Audio(label=i18n("Generated Audio"))

         convert_button.click(
             fn=run_tts_script,
tabs/workflow/workflow.py CHANGED
@@ -48,28 +48,36 @@ def workflow_tab():
             label=i18n("Source"),
             info=i18n("Enter the document ID or URL."),
         )
+
         fetch_button = gr.Button(i18n("Fetch"))

         text = gr.Textbox(
             label=i18n("Text"),
-            interactive=True,
+            visible=False,
         )

         voice = gr.Dropdown(
             label=i18n("Voice"),
             choices=voice_manager.voices.keys(),
             value=voice_manager.voice_names[0],
+            visible=len(voice_manager.voices) > 1,
         )

         synthesize_button = gr.Button(i18n("Synthesize"))

-        status = gr.Textbox(visible=False)
+        audio = gr.Audio(
+            label=i18n("Generated Audio"),
+        )

-        audio = gr.Audio(label=i18n("Export Audio"))
+        status = gr.Markdown(
+            label=i18n("Status"),
+            show_label=True,
+        )

         with gr.Column():
             markdown = gr.Markdown(
                 label=i18n("Document"),
+                show_label=True,
             )

         fetch_button.click(
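The hunk above hides the text box until it has something to show (`visible=False`) and only displays the voice dropdown when more than one voice is configured; the `fetch_button.click(...)` wiring itself is cut off at the hunk boundary. As a minimal sketch only, assuming a hypothetical `fetch_document` helper and standard Gradio `gr.update` semantics, a fetch handler could reveal and pre-fill the hidden textbox like this:

import gradio as gr


def fetch_document(source: str) -> str:
    # Hypothetical helper: resolve the document ID/URL and return its plain text.
    return f"(contents of {source})"


def on_fetch(source: str):
    body = fetch_document(source)
    # Return values map positionally onto outputs=[markdown, text]: the Markdown
    # pane shows the document, and the hidden Textbox is revealed and pre-filled.
    return body, gr.update(value=body, visible=True)


# Wiring, reusing the component names from the hunk above:
# fetch_button.click(fn=on_fetch, inputs=[source], outputs=[markdown, text])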
tts_service/tts.py CHANGED
@@ -4,6 +4,7 @@ import logging
 import math
 import os
 import time
+from contextlib import suppress
 from functools import lru_cache

 import edge_tts
@@ -11,7 +12,7 @@ import gradio as gr
 import httpx
 import soundfile as sf

-from tts_service.utils import cache_path, env_str
+from tts_service.utils import cache_path, env_str, seconds_to_ms
 from tts_service.voices import voice_manager

 log = logging.getLogger(__name__)
@@ -35,25 +36,33 @@ async def run_tts_script(
         log.debug("Progress: %.1f%%: %s", pct * 100, msg)
         progress(pct, msg)

-    log.info("Synthesizing text (%s chars)", len(text))
-
+    tts_start = -1.0
+    rvc_start = -1.0
+    ts0 = time.time()
     update_progress(0, "Starting...")
     voice = voice_manager.voices[voice_name]

     text = text.strip()
     output_tts_path = cache_path(voice.tts, "", rate, text, extension="mp3")
     text_ptr = 0
-    if not os.path.exists(output_tts_path):
+    tts_cached = os.path.exists(output_tts_path)
+    if not tts_cached:
+        log.info("Synthesizing %s chars into %s", len(text), output_tts_path)
         rates = f"+{rate}%" if rate >= 0 else f"{rate}%"
         communicate = edge_tts.Communicate(text, voice.tts, rate=rates)
         with open(output_tts_path, "wb") as f:
             async for chunk in communicate.stream():
+                if tts_start < 0:
+                    tts_start = time.time()
+
                 chunk_type = chunk["type"]
                 if chunk_type == "audio":
                     f.write(chunk["data"])
                 elif chunk_type == "WordBoundary":
                     chunk_text = chunk["text"]
-                    text_index = text.index(chunk_text, text_ptr)
+                    text_index = -1
+                    with suppress(ValueError):
+                        text_index = text.index(chunk_text, text_ptr)
                     if text_index == -1:
                         log.warning("Extraneous text received from edge tts: %s", chunk_text)
                         continue
@@ -63,29 +72,39 @@
                     update_progress(pct_complete / 2, "Synthesizing...")
                 else:
                     log.warning("Unknown chunk type: %s: %s", chunk_type, json.dumps(chunk))
+    else:
+        log.info("TTS cached at %s", output_tts_path)

     audio_duration = sf.info(output_tts_path).duration
-    expected_processing_time = audio_duration / 8 + 10 # 10x real-time on nvidia t4
-    log.info(f"Synthesized {audio_duration:,.0f}s, expected processing time: {expected_processing_time:,.0f}s")
+    expected_processing_time = audio_duration / 10 + 20 # 10x real-time on nvidia t4

+    ts1 = time.time()
     output_rvc_path = cache_path(voice.tts, voice.name, rate, text, extension="mp3")
-    if not os.path.exists(output_rvc_path):
-        ts0 = time.time()
+    rvc_cached = os.path.exists(output_rvc_path)
+    if not rvc_cached:
+        log.info(
+            "Converting %s of audio into %s. Expected duration: %s",
+            seconds_to_ms(audio_duration),
+            output_rvc_path,
+            seconds_to_ms(expected_processing_time),
+        )
         last_check = 0.0
-        timeout = httpx.Timeout(5, read=15.0)
+        timeout = httpx.Timeout(5, read=60.0)
         endpoint_url = env_str("RVC_ENDPOINT")
         async with httpx.AsyncClient(timeout=timeout) as client:
             response = await client.post(f"{endpoint_url}/v1/rvc", content=output_tts_path.read_bytes())
+            rvc_start = time.time()
             response.raise_for_status()
             data = response.json()
             log.info("Submitted for conversion: %s", data)
             result_url = data["urls"]["result"]
             while True:
-                elapsed = time.time() - ts0
+                elapsed = time.time() - ts1
+                rvc_elapsed = time.time() - rvc_start
                 proportion = elapsed / expected_processing_time
                 pct_complete = 0.5 + math.tanh(proportion) / 2
                 update_progress(pct_complete, "Processing...")
-                if elapsed > 0.8 * expected_processing_time and elapsed - last_check > 10:
+                if rvc_elapsed > 0.8 * expected_processing_time and elapsed - last_check > 10:
                     last_check = elapsed
                     response = await client.get(result_url)
                     content_type = response.headers.get("Content-Type")
@@ -101,7 +120,43 @@
     else:
         log.info("Already converted: %s", output_rvc_path)

-    return f"{audio_duration:,.0f}s of audio successfully synthesized.", str(output_rvc_path)
-
-
-    # Prerequisites
+    def format_duration(duration: float) -> str:
+        return "Cached" if duration < 1 else seconds_to_ms(duration)
+
+    def format_wpm(duration: float) -> str:
+        return "Cached" if duration < 1 else f"{word_count * 60 / duration:,.0f}"
+
+    def format_rate(duration: float) -> str:
+        return "Cached" if duration < 1 else f"{audio_duration / duration:.1f}x"
+
+    def format_latency(latency: float) -> str:
+        return "N/A" if latency < 1 else f"{latency:.2f}s"
+
+    ts2 = time.time()
+    total_time = ts2 - ts0
+    rvc_time = ts2 - rvc_start if rvc_start > 0 else 0
+    tts_time = ts1 - tts_start if tts_start > 0 else 0
+    word_count = len(text.split())
+    durations = (audio_duration, total_time, tts_time, rvc_time)
+    times = " | ".join(format_duration(t) for t in durations)
+    wpms = " | ".join(format_wpm(t) for t in durations)
+    rates = " | ".join(format_rate(t) for t in durations)
+    latencies = " | ".join(format_latency(latency) for latency in (0, 0, tts_start - ts0, rvc_start - ts1))
+    rvc_cost = "N/A" if rvc_cached else f"{rvc_time * 0.0164:.1f}¢"
+    markdown_status = f"""
+Audio successfully synthesized.
+
+| | Words | Chars | Cost |
+|-----|------:|------:|-----:|
+|Count|{word_count:,}|{len(text):,}|{rvc_cost}|
+
+| |Actual|Processing|TTS|RVC|
+|-----|-----:|---------:|--:|--:|
+|Time|{times}|
+|WPM|{wpms}|
+|Rate|{rates}|
+|Latency|{latencies}|
+
+""".strip()
+    log.info(markdown_status)
+    return markdown_status, str(output_rvc_path)
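The conversion loop above maps elapsed time onto the progress bar with `0.5 + math.tanh(elapsed / expected_processing_time) / 2`, so the RVC stage starts at 50% and creeps toward (but never reaches) 100% even when a job overruns its estimate. A small worked example of that mapping, with an assumed 30-second estimate:

import math


def processing_progress(elapsed: float, expected: float) -> float:
    # Same curve as run_tts_script: 50% when conversion starts, ~88% at the
    # expected finish time, asymptotically approaching 100% afterwards.
    return 0.5 + math.tanh(elapsed / expected) / 2


for elapsed in (0, 15, 30, 60):
    print(f"{elapsed:>3}s -> {processing_progress(elapsed, expected=30):.0%}")
# prints roughly: 50%, 73%, 88%, 98%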
tts_service/utils.py CHANGED
@@ -61,3 +61,10 @@ def markdown_to_text(md: str) -> str:
     doc = pandoc.read(md, format="markdown")
     txt = pandoc.write(doc, format="plain-smart", options=["--wrap=none"])
     return cast(str, txt)
+
+
+def seconds_to_ms(seconds: float) -> str:
+    minutes = int(seconds) // 60
+    secs = int(seconds) % 60
+    ds = int((seconds - int(seconds)) * 10)
+    return f"{minutes:02}:{secs:02}.{ds:01}"
tts_service/voices.py CHANGED
@@ -29,20 +29,7 @@ class S3VoiceObj:

 class Voice(BaseModel):
     name: str
-    model: str
     tts: str
-    index: str = ""
-    autotune: float | None = None
-    clean: float | None = 0.5
-    upscale: bool = False
-    pitch: int = 0
-    filter_radius: int = 3
-    index_rate: float = 0.75
-    rms_mix_rate: float = 1
-    protect: float = 0.5
-    hop_length: int = 128
-    f0_method: str = "rmvpe"
-    embedder_model: str = "contentvec"


 class TTSVoice(BaseModel):
@@ -110,13 +97,9 @@ class VoiceManager:
         rv = {}
         for path in sorted(self.voices_dir.glob("*.json")):
             voice = Voice.model_validate_json(path.read_bytes())
-            model_path = self.voices_dir / f"{voice.model}"
-            if not model_path.exists():
-                logging.warning("Voice %s missing model %s", voice.name, voice.model)
-            elif voice.tts not in self.tts_voices:
-                logging.warning("Voice %s references invalid tts %s", voice.name, voice.model)
+            if voice.tts not in self.tts_voices:
+                logging.warning("Voice %s references invalid tts %s", voice.name, voice.tts)
             else:
-                voice.model = str(model_path)
                 rv[voice.name] = voice
         return rv
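With the per-model tuning fields removed, a voice definition JSON only needs `name` and `tts`, and the VoiceManager loader now just checks that the referenced TTS voice exists. A minimal sketch of a conforming payload (the values are illustrative, not from the repository):

from tts_service.voices import Voice

payload = '{"name": "Aria", "tts": "en-US-AriaNeural"}'  # illustrative values
voice = Voice.model_validate_json(payload)
print(voice.name, voice.tts)  # Aria en-US-AriaNeural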