jlopez00 committed on
Commit
1c6f49f
1 Parent(s): 2c01ee6

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. tts_service/tts.py +46 -52
tts_service/tts.py CHANGED
@@ -1,13 +1,17 @@
1
  import asyncio
2
  import json
3
  import logging
 
4
  import os
 
5
  from functools import lru_cache
6
 
7
  import edge_tts
8
  import gradio as gr
 
 
9
 
10
- from tts_service.utils import cache_path
11
  from tts_service.voices import voice_manager
12
 
13
  log = logging.getLogger(__name__)
@@ -27,19 +31,17 @@ async def run_tts_script(
27
  rate: int = 0,
28
  progress=gr.Progress(), # noqa: B008
29
  ) -> tuple[str, str]:
30
- async def update_progress(pct, msg) -> None:
31
  log.debug("Progress: %.1f%%: %s", pct * 100, msg)
32
  progress(pct, msg)
33
- await asyncio.sleep(0)
34
 
35
  log.info("Synthesizing text (%s chars)", len(text))
36
 
37
- await update_progress(0, "Starting...")
38
  voice = voice_manager.voices[voice_name]
39
- format = "wav"
40
 
41
  text = text.strip()
42
- output_tts_path = cache_path(voice.tts, "", rate, text, extension=format)
43
  text_ptr = 0
44
  if not os.path.exists(output_tts_path):
45
  rates = f"+{rate}%" if rate >= 0 else f"{rate}%"
@@ -58,56 +60,48 @@ async def run_tts_script(
58
  text_ptr = text_index + len(chunk_text)
59
  pct_complete = text_ptr / len(text)
60
  log.debug("%.1f%%: %s", pct_complete * 100, chunk)
61
- await update_progress(pct_complete / 2, "Synthesizing...")
62
  else:
63
  log.warning("Unknown chunk type: %s: %s", chunk_type, json.dumps(chunk))
64
 
65
- output_rvc_path = cache_path(voice.tts, voice.name, rate, text, extension=format)
 
 
 
 
66
  if not os.path.exists(output_rvc_path):
67
- infer_pipeline = import_voice_converter()
68
- await infer_pipeline.convert_audio(
69
- pitch=voice.pitch,
70
- filter_radius=voice.filter_radius,
71
- index_rate=voice.index_rate,
72
- volume_envelope=voice.rms_mix_rate,
73
- protect=voice.protect,
74
- hop_length=voice.hop_length,
75
- f0_method=voice.f0_method,
76
- audio_input_path=str(output_tts_path),
77
- audio_output_path=str(output_rvc_path),
78
- model_path=voice.model,
79
- index_path=voice.index,
80
- split_audio=True,
81
- f0_autotune=voice.autotune is not None,
82
- f0_autotune_strength=voice.autotune,
83
- clean_audio=voice.clean is not None,
84
- clean_strength=voice.clean,
85
- export_format=format.upper(),
86
- upscale_audio=voice.upscale,
87
- f0_file=None,
88
- embedder_model=voice.embedder_model,
89
- embedder_model_custom=None,
90
- sid=0,
91
- formant_shifting=None,
92
- formant_qfrency=None,
93
- formant_timbre=None,
94
- post_process=None,
95
- reverb=None,
96
- pitch_shift=None,
97
- limiter=None,
98
- gain=None,
99
- distortion=None,
100
- chorus=None,
101
- bitcrush=None,
102
- clipping=None,
103
- compressor=None,
104
- delay=None,
105
- sliders=None,
106
- callback=lambda pct: update_progress(0.5 + pct / 2, "Converting..."),
107
- )
108
-
109
- log.info("Successfully synthesized text (%s chars)", len(text))
110
- return "Text synthesized successfully.", str(output_rvc_path)
111
 
112
 
113
  # Prerequisites
 
1
  import asyncio
2
  import json
3
  import logging
4
+ import math
5
  import os
6
+ import time
7
  from functools import lru_cache
8
 
9
  import edge_tts
10
  import gradio as gr
11
+ import httpx
12
+ import soundfile as sf
13
 
14
+ from tts_service.utils import cache_path, env_str
15
  from tts_service.voices import voice_manager
16
 
17
  log = logging.getLogger(__name__)
 
31
  rate: int = 0,
32
  progress=gr.Progress(), # noqa: B008
33
  ) -> tuple[str, str]:
34
+ def update_progress(pct, msg) -> None:
35
  log.debug("Progress: %.1f%%: %s", pct * 100, msg)
36
  progress(pct, msg)
 
37
 
38
  log.info("Synthesizing text (%s chars)", len(text))
39
 
40
+ update_progress(0, "Starting...")
41
  voice = voice_manager.voices[voice_name]
 
42
 
43
  text = text.strip()
44
+ output_tts_path = cache_path(voice.tts, "", rate, text, extension="mp3")
45
  text_ptr = 0
46
  if not os.path.exists(output_tts_path):
47
  rates = f"+{rate}%" if rate >= 0 else f"{rate}%"
 
60
  text_ptr = text_index + len(chunk_text)
61
  pct_complete = text_ptr / len(text)
62
  log.debug("%.1f%%: %s", pct_complete * 100, chunk)
63
+ update_progress(pct_complete / 2, "Synthesizing...")
64
  else:
65
  log.warning("Unknown chunk type: %s: %s", chunk_type, json.dumps(chunk))
66
 
67
+ audio_duration = sf.info(output_tts_path).duration
68
+ expected_processing_time = audio_duration / 8 + 10 # 10x real-time on nvidia t4
69
+ log.info(f"Synthesized {audio_duration:,.0f}s, expected processing time: {expected_processing_time:,.0f}s")
70
+
71
+ output_rvc_path = cache_path(voice.tts, voice.name, rate, text, extension="mp3")
72
  if not os.path.exists(output_rvc_path):
73
+ ts0 = time.time()
74
+ last_check = 0.0
75
+ timeout = httpx.Timeout(5, read=15.0)
76
+ endpoint_url = env_str("RVC_ENDPOINT")
77
+ async with httpx.AsyncClient(timeout=timeout) as client:
78
+ response = await client.post(f"{endpoint_url}/v1/rvc", content=output_tts_path.read_bytes())
79
+ response.raise_for_status()
80
+ data = response.json()
81
+ log.info("Submitted for conversion: %s", data)
82
+ result_url = data["urls"]["result"]
83
+ while True:
84
+ elapsed = time.time() - ts0
85
+ proportion = elapsed / expected_processing_time
86
+ pct_complete = 0.5 + math.tanh(proportion) / 2
87
+ update_progress(pct_complete, "Processing...")
88
+ if elapsed > 0.8 * expected_processing_time and elapsed - last_check > 10:
89
+ last_check = elapsed
90
+ response = await client.get(result_url)
91
+ content_type = response.headers.get("Content-Type")
92
+ processed_bytes = await response.aread()
93
+ log.info(f"Checking status: %s (%s) {len(processed_bytes):,} bytes", response.status_code, content_type)
94
+ if response.status_code == 200 and content_type == "audio/mpeg":
95
+ output_rvc_path.write_bytes(processed_bytes)
96
+ break
97
+ elif response.status_code != 404:
98
+ response.raise_for_status()
99
+ await asyncio.sleep(0.1)
100
+ log.info("Successfully converted text (%s chars) -> %s", len(text), output_rvc_path)
101
+ else:
102
+ log.info("Already converted: %s", output_rvc_path)
103
+
104
+ return f"{audio_duration:,.0f}s of audio successfully synthesized.", str(output_rvc_path)
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
 
107
  # Prerequisites