Spaces:
Running
Running
glide-the
committed on
Commit
β’
8953210
1
Parent(s):
ffe5bd7
Add large files to Git LFS
Browse files- Dockerfile +21 -0
- README.md +11 -0
- requirements.txt +52 -0
- setup.py +36 -0
- start.py +4 -0
- util.py +107 -0
Dockerfile
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.10

# System packages needed at build/run time:
#   - cmake, gcc: compile native extensions pulled in by requirements.txt
#   - portaudio19-dev: headers for the pyaudio requirement
#   - ffmpeg: util.py spawns the `ffmpeg` binary to convert edge-tts MP3 output to WAV
# apt-get is used instead of apt (apt's CLI is not meant for scripts), and the
# package lists are removed afterwards so they do not stay in the image layer.
RUN apt-get update && \
    apt-get install -y cmake gcc portaudio19-dev ffmpeg && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /code
ENV NUMBA_CACHE_DIR=/tmp/
# Copy only the requirements first so the dependency layer is cached
# independently of source changes.
COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
COPY . /code/

RUN pip install -e .

# Build the monotonic_align extension in place, then move the build output
# from the nested vits/monotonic_align/ directory up next to the sources.
RUN cd /code/vits/monotonic_align && \
    mkdir -p /code/vits/monotonic_align/vits/monotonic_align/ && \
    python setup.py build_ext --inplace && \
    mv /code/vits/monotonic_align/vits/monotonic_align/* /code/vits/monotonic_align/

CMD ["python", "-m", "speakers", "--verbose", "--mode", "web"]

EXPOSE 7860
|
README.md
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: RVC Speakers
|
3 |
+
emoji: π
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: red
|
6 |
+
sdk: docker
|
7 |
+
pinned: false
|
8 |
+
license: bsd-3-clause
|
9 |
+
---
|
10 |
+
|
11 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
requirements.txt
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Cython==0.29.21
|
2 |
+
# vits_text
|
3 |
+
unidecode
|
4 |
+
# pre-install cmake (required to build pyopenjtalk)
|
5 |
+
pyopenjtalk
|
6 |
+
jamo
|
7 |
+
pypinyin
|
8 |
+
jieba
|
9 |
+
cn2an
|
10 |
+
# vits_text
|
11 |
+
|
12 |
+
nest_asyncio
|
13 |
+
gradio==3.33.1
|
14 |
+
fairseq
|
15 |
+
torch
|
16 |
+
torchaudio
|
17 |
+
soundfile
|
18 |
+
scipy==1.9.3
|
19 |
+
librosa==0.9.1
|
20 |
+
musicdl
|
21 |
+
pyaudio
|
22 |
+
torchcrepe==0.0.20
|
23 |
+
praat-parselmouth>=0.4.2
|
24 |
+
pyworld==0.3.2
|
25 |
+
faiss-cpu==1.7.3
|
26 |
+
numpy==1.23.5
|
27 |
+
nltk
|
28 |
+
|
29 |
+
edge-tts
|
30 |
+
IPython
|
31 |
+
tqdm
|
32 |
+
pandas
|
33 |
+
|
34 |
+
## bark
|
35 |
+
transformers
|
36 |
+
encodec
|
37 |
+
huggingface-hub>=0.14.1
|
38 |
+
funcy
|
39 |
+
|
40 |
+
# config manage
|
41 |
+
omegaconf
|
42 |
+
pydantic
|
43 |
+
|
44 |
+
# log
|
45 |
+
colorama
|
46 |
+
|
47 |
+
# server
|
48 |
+
fastapi~=0.99.1
|
49 |
+
starlette~=0.27.0
|
50 |
+
uvicorn~=0.23.1
|
51 |
+
requests
|
52 |
+
oscrypto
|
setup.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Copyright (c) 2022, salesforce.com, inc.
|
3 |
+
All rights reserved.
|
4 |
+
SPDX-License-Identifier: BSD-3-Clause
|
5 |
+
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
6 |
+
"""
|
7 |
+
|
8 |
+
from setuptools import setup, find_namespace_packages
|
9 |
+
import platform
|
10 |
+
|
11 |
+
DEPENDENCY_LINKS = []
|
12 |
+
if platform.system() == "Windows":
|
13 |
+
DEPENDENCY_LINKS.append("https://download.pytorch.org/whl/torch_stable.html")
|
14 |
+
|
15 |
+
|
16 |
+
def fetch_requirements(filename):
    """Read a pip requirements file and return its requirement specifiers.

    Args:
        filename: Path of the requirements file to read.

    Returns:
        A list with one stripped string per requirement line. Blank lines and
        lines starting with ``#`` are skipped, so section comments in the file
        are not handed to ``install_requires`` as if they were packages.
    """
    with open(filename) as f:
        stripped_lines = (ln.strip() for ln in f.read().split("\n"))
        return [ln for ln in stripped_lines if ln and not ln.startswith("#")]
|
19 |
+
|
20 |
+
|
21 |
+
# Read the long description through a context manager so the README file
# handle is closed deterministically instead of being left to the GC.
with open("README.md", "r", encoding="utf-8") as _readme_file:
    _LONG_DESCRIPTION = _readme_file.read()

setup(
    name="speakers",
    version="0.0.1",
    author="glide-the",
    description="Ready Voice Controller οΌ generate for End-to-End Text-to-SpeechοΌwith Multi-engine integration",
    long_description=_LONG_DESCRIPTION,
    long_description_content_type="text/markdown",
    keywords="Speakers, Multimodal, Ready Voice Controller",
    license="3-Clause BSD",
    # NOTE(review): setuptools documents `include` as a sequence of patterns;
    # a bare string may not filter packages as intended -- confirm before
    # changing, since the set of installed packages depends on it.
    packages=find_namespace_packages(include="speakers.*"),
    # Dependencies are taken from requirements.txt via fetch_requirements().
    install_requires=fetch_requirements("requirements.txt"),
    python_requires=">=3.9.0",
    include_package_data=True,
    dependency_links=DEPENDENCY_LINKS,
    zip_safe=False,
)
|
start.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from speakers.__main__ import main
|
2 |
+
|
3 |
+
if __name__ == '__main__':
|
4 |
+
main()
|
util.py
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import asyncio
|
3 |
+
from io import BytesIO
|
4 |
+
|
5 |
+
from fairseq import checkpoint_utils
|
6 |
+
|
7 |
+
import torch
|
8 |
+
|
9 |
+
import edge_tts
|
10 |
+
import librosa
|
11 |
+
|
12 |
+
|
13 |
+
# https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI/blob/main/config.py#L43-L55 # noqa
|
14 |
+
def has_mps() -> bool:
    """Report whether a tensor can actually be placed on the Apple MPS device.

    Returns:
        ``False`` on any platform other than macOS (``sys.platform`` is not
        ``"darwin"``), when torch reports no MPS support, or when moving a
        small tensor to the ``"mps"`` device raises. ``True`` only if that
        probe allocation succeeds.
    """
    if sys.platform != "darwin":
        return False

    # Prefer torch.backends.mps.is_available(); the torch.has_mps attribute is
    # deprecated, so relying on it alone would report False wherever the
    # attribute is missing. It is kept only as a fallback for torch builds
    # that do not expose torch.backends.mps.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and hasattr(mps_backend, "is_available"):
        if not mps_backend.is_available():
            return False
    elif not getattr(torch, "has_mps", False):
        return False

    # Availability flags can be set while the device is still unusable, so
    # confirm with a real allocation on the device.
    try:
        torch.zeros(1).to(torch.device("mps"))
        return True
    except Exception:
        return False
|
26 |
+
|
27 |
+
|
28 |
+
def is_half(device: str) -> bool:
    """Decide whether models on ``device`` should run in half precision.

    Args:
        device: Device string such as ``"cpu"``, ``"cuda"`` or ``"cuda:0"``.

    Returns:
        ``False`` for any device whose string does not start with ``"cuda"``.
        For CUDA devices, ``False`` when the upper-cased GPU name contains
        ``"16"`` without ``"V100"``, or contains ``"P40"``, ``"1060"``,
        ``"1070"`` or ``"1080"``; ``True`` otherwise.
    """
    if not device.startswith('cuda'):
        return False

    # torch.device() parses both "cuda" and "cuda:N". Parsing the index by
    # hand with int(device.split(':')[-1]) raised ValueError for plain "cuda".
    gpu_name = torch.cuda.get_device_name(torch.device(device)).upper()

    # Substring heuristics on the GPU name for cards that should stay in fp32.
    if '16' in gpu_name and 'V100' not in gpu_name:
        return False
    if any(marker in gpu_name for marker in ('P40', '1060', '1070', '1080')):
        return False

    return True
|
47 |
+
|
48 |
+
|
49 |
+
def load_hubert_model(device: str, model_path: str = 'hubert_base.pt'):
    """Load a fairseq checkpoint, move it to ``device`` and set its precision.

    Args:
        device: Device string the model is moved to; also passed to
            ``is_half`` to choose the precision.
        model_path: Checkpoint path handed to fairseq; defaults to
            ``'hubert_base.pt'``.

    Returns:
        The first model of the loaded ensemble, converted with ``.half()``
        when ``is_half(device)`` is true and with ``.float()`` otherwise.
    """
    loaded = checkpoint_utils.load_model_ensemble_and_task([model_path])
    # Element [0][0] of the result is the first model of the ensemble.
    hubert = loaded[0][0].to(device)

    return hubert.half() if is_half(device) else hubert.float()
|
58 |
+
|
59 |
+
|
60 |
+
async def _edge_tts_to_waveform(tts_com):
    """Stream audio from an edge-tts communicator and decode it with librosa.

    Shared by ``call_edge_tts`` and ``call_edge_tts_config``, which previously
    carried two identical copies of this logic.

    Args:
        tts_com: An ``edge_tts.Communicate`` instance ready to be streamed.

    Returns:
        The result of ``librosa.load`` on the WAV bytes produced by ffmpeg.
    """
    # Collect the audio chunks and join once; repeated ``bytes +=`` copies the
    # whole buffer on every chunk.
    audio_chunks = [
        chunk['data']
        async for chunk in tts_com.stream()
        if chunk['type'] == 'audio'
    ]
    tts_raw = b''.join(audio_chunks)

    # Convert mp3 stream to wav: ffmpeg reads MP3 on stdin, writes WAV on stdout.
    ffmpeg_proc = await asyncio.create_subprocess_exec(
        'ffmpeg',
        '-f', 'mp3',
        '-i', '-',
        '-f', 'wav',
        '-loglevel', 'error',
        '-',
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE
    )
    (tts_wav, _) = await ffmpeg_proc.communicate(tts_raw)

    return librosa.load(BytesIO(tts_wav))


async def call_edge_tts(speaker_name: str, text: str):
    """Synthesize ``text`` with edge-tts using the voice ``speaker_name``.

    Args:
        speaker_name: Voice identifier passed to ``edge_tts.Communicate``.
        text: Text to synthesize.

    Returns:
        The result of ``librosa.load`` on the synthesized audio after it has
        been converted from MP3 to WAV by an ``ffmpeg`` subprocess.
    """
    tts_com = edge_tts.Communicate(text, speaker_name)
    return await _edge_tts_to_waveform(tts_com)


async def call_edge_tts_config(speaker_name: str, text: str, rate: str, volume: str):
    """Synthesize ``text`` with edge-tts using an explicit rate and volume.

    Args:
        speaker_name: Voice identifier passed as ``voice``.
        text: Text to synthesize.
        rate: Forwarded unchanged as the ``rate`` argument of
            ``edge_tts.Communicate``.
        volume: Forwarded unchanged as the ``volume`` argument of
            ``edge_tts.Communicate``.

    Returns:
        The result of ``librosa.load`` on the synthesized audio after it has
        been converted from MP3 to WAV by an ``ffmpeg`` subprocess.
    """
    tts_com = edge_tts.Communicate(text=text, voice=speaker_name, rate=rate, volume=volume)
    return await _edge_tts_to_waveform(tts_com)
|