Spaces:

dmeck
/

RVC-Speakers

Running

App Files Files Community

glide-the committed on Aug 24, 2023

Commit

8953210

1 Parent(s): ffe5bd7

Add large files to Git LFS

Browse files

Files changed (6) hide show

Dockerfile +21 -0
README.md +11 -0
requirements.txt +52 -0
setup.py +36 -0
start.py +4 -0
util.py +107 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,21 @@

+FROM python:3.10
+RUN apt update && apt install -y cmake gcc portaudio19-dev
+WORKDIR /code
+ENV NUMBA_CACHE_DIR=/tmp/
+COPY ./requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+COPY . /code/
+RUN pip install -e .
+RUN cd /code/vits/monotonic_align && \
+    mkdir -p /code/vits/monotonic_align/vits/monotonic_align/ && \
+    python setup.py build_ext --inplace && \
+    mv /code/vits/monotonic_align/vits/monotonic_align/* /code/vits/monotonic_align/
+CMD ["python", "-m", "speakers", "--verbose", "--mode", "web"]
+EXPOSE 7860

README.md ADDED Viewed

	@@ -0,0 +1,11 @@

+---
+title: RVC Speakers
+emoji: 📚
+colorFrom: green
+colorTo: red
+sdk: docker
+pinned: false
+license: bsd-3-clause
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

requirements.txt ADDED Viewed

	@@ -0,0 +1,52 @@

+Cython==0.29.21
+# vits_text
+unidecode
+# pyopenjtalk needs cmake pre-installed to build
+pyopenjtalk
+jamo
+pypinyin
+jieba
+cn2an
+# vits_text
+nest_asyncio
+gradio==3.33.1
+fairseq
+torch
+torchaudio
+soundfile
+scipy==1.9.3
+librosa==0.9.1
+musicdl
+pyaudio
+torchcrepe==0.0.20
+praat-parselmouth>=0.4.2
+pyworld==0.3.2
+faiss-cpu==1.7.3
+numpy==1.23.5
+nltk
+edge-tts
+IPython
+tqdm
+pandas
+## bark
+transformers
+encodec
+huggingface-hub>=0.14.1
+funcy
+# config manage
+omegaconf
+pydantic
+# log
+colorama
+# server
+fastapi~=0.99.1
+starlette~=0.27.0
+uvicorn~=0.23.1
+requests
+oscrypto

setup.py ADDED Viewed

	@@ -0,0 +1,36 @@

+"""
+ Copyright (c) 2022, salesforce.com, inc.
+ All rights reserved.
+ SPDX-License-Identifier: BSD-3-Clause
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+"""
+from setuptools import setup, find_namespace_packages
+import platform
+# Extra index URL handed to setup() as ``dependency_links``; only populated
+# when running on Windows, empty on every other platform.
+DEPENDENCY_LINKS = (
+    ["https://download.pytorch.org/whl/torch_stable.html"]
+    if platform.system() == "Windows"
+    else []
+)
+def fetch_requirements(filename):
+    """Read a pip requirements file and return its requirement strings.
+
+    Blank lines and lines that start with ``#`` are skipped so that only
+    real requirement specifiers are returned.
+
+    Args:
+        filename: Path of the requirements file to read.
+
+    Returns:
+        A list of whitespace-stripped requirement strings, in file order.
+    """
+    with open(filename, encoding="utf-8") as f:
+        stripped = (ln.strip() for ln in f)
+        # Comment and empty lines are not requirements; drop them here
+        # instead of relying on the consumer to ignore them.
+        return [ln for ln in stripped if ln and not ln.startswith("#")]
+# Read the long description up front so the README file handle is closed as
+# soon as its contents are loaded, rather than left open until collected.
+with open("README.md", "r", encoding="utf-8") as readme_file:
+    _LONG_DESCRIPTION = readme_file.read()
+# Distribution metadata for the ``speakers`` package.
+setup(
+    name="speakers",
+    version="0.0.1",
+    author="glide-the",
+    description="Ready Voice Controller ， generate for End-to-End Text-to-Speech，with Multi-engine integration",
+    long_description=_LONG_DESCRIPTION,
+    long_description_content_type="text/markdown",
+    keywords="Speakers, Multimodal, Ready Voice Controller",
+    license="3-Clause BSD",
+    # NOTE(review): ``include`` receives a bare string rather than a list of
+    # patterns; confirm setuptools selects the intended packages with this.
+    packages=find_namespace_packages(include="speakers.*"),
+    # Runtime dependencies are taken from requirements.txt next to this file.
+    install_requires=fetch_requirements("requirements.txt"),
+    python_requires=">=3.9.0",
+    include_package_data=True,
+    dependency_links=DEPENDENCY_LINKS,
+    zip_safe=False,
+)

start.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from speakers.__main__ import main
+# Script entry point: hand control to the package's ``main`` when this file
+# is executed directly (not when it is imported).
+if __name__ == "__main__":
+    main()

util.py ADDED Viewed

	@@ -0,0 +1,107 @@

+import sys
+import asyncio
+from io import BytesIO
+from fairseq import checkpoint_utils
+import torch
+import edge_tts
+import librosa
+# https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI/blob/main/config.py#L43-L55  # noqa
+def has_mps() -> bool:
+    """Report whether torch can actually use the ``mps`` device.
+
+    Returns:
+        True only when running on macOS, torch exposes a truthy ``has_mps``
+        attribute, and moving a tensor to the ``mps`` device succeeds.
+        False in every other case.
+    """
+    on_macos = sys.platform == "darwin"
+    if not (on_macos and getattr(torch, 'has_mps', False)):
+        return False
+    # The attribute alone is not trusted: probe with a real tensor transfer.
+    try:
+        torch.zeros(1).to(torch.device("mps"))
+    except Exception:
+        return False
+    return True
+def is_half(device: str) -> bool:
+    """Decide whether a model on ``device`` should run in half precision.
+
+    Args:
+        device: A torch device string such as ``"cpu"``, ``"cuda"`` or
+            ``"cuda:1"``.
+
+    Returns:
+        False for any non-CUDA device and for CUDA devices whose reported
+        name matches one of the excluded markers below; True otherwise.
+    """
+    if not device.startswith('cuda'):
+        return False
+    # Let torch parse the device string so a bare "cuda" (no ":N" suffix) is
+    # accepted; its index is None, which get_device_name resolves to the
+    # current CUDA device.
+    gpu_index = torch.device(device).index
+    gpu_name = torch.cuda.get_device_name(gpu_index).upper()
+    # Names containing "16" are excluded unless they are a V100
+    # (presumably to catch GTX 16xx cards without rejecting "V100 ... 16GB";
+    # TODO confirm against the upstream RVC config).
+    if '16' in gpu_name and 'V100' not in gpu_name:
+        return False
+    # Remaining models that are kept on full precision.
+    full_precision_markers = ('P40', '1060', '1070', '1080')
+    return not any(marker in gpu_name for marker in full_precision_markers)
+def load_hubert_model(device: str, model_path: str = 'hubert_base.pt'):
+    """Load the checkpoint at ``model_path`` with fairseq and move it to ``device``.
+
+    Args:
+        device: Target device string passed to ``.to()`` and ``is_half``.
+        model_path: Checkpoint file to load; defaults to ``hubert_base.pt``.
+
+    Returns:
+        The loaded model, converted to half precision when ``is_half(device)``
+        is true and to float precision otherwise.
+    """
+    loaded = checkpoint_utils.load_model_ensemble_and_task([model_path])
+    # Only the first entry of the first returned element is used.
+    hubert = loaded[0][0].to(device)
+    return hubert.half() if is_half(device) else hubert.float()
+async def _collect_tts_audio(tts_com) -> bytes:
+    """Drain an edge-tts stream and return its audio chunks joined together."""
+    # Gather the chunks first and join once, instead of growing a bytes
+    # object with ``+=`` on every chunk.
+    audio_parts = [
+        chunk['data']
+        async for chunk in tts_com.stream()
+        if chunk['type'] == 'audio'
+    ]
+    return b''.join(audio_parts)
+async def _mp3_bytes_to_audio(tts_raw: bytes):
+    """Convert MP3 bytes to WAV through ffmpeg and load the result with librosa."""
+    # ffmpeg reads the MP3 from stdin and writes the WAV to stdout, so no
+    # temporary files are involved.
+    ffmpeg_proc = await asyncio.create_subprocess_exec(
+        'ffmpeg',
+        '-f', 'mp3',
+        '-i', '-',
+        '-f', 'wav',
+        '-loglevel', 'error',
+        '-',
+        stdin=asyncio.subprocess.PIPE,
+        stdout=asyncio.subprocess.PIPE
+    )
+    (tts_wav, _) = await ffmpeg_proc.communicate(tts_raw)
+    return librosa.load(BytesIO(tts_wav))
+async def call_edge_tts(speaker_name: str, text: str):
+    """Synthesize ``text`` with edge-tts and return it as loaded audio.
+
+    Args:
+        speaker_name: Voice passed to ``edge_tts.Communicate``.
+        text: Text to synthesize.
+
+    Returns:
+        The value returned by ``librosa.load`` for the converted WAV data.
+    """
+    tts_com = edge_tts.Communicate(text, speaker_name)
+    tts_raw = await _collect_tts_audio(tts_com)
+    return await _mp3_bytes_to_audio(tts_raw)
+async def call_edge_tts_config(speaker_name: str, text: str, rate: str, volume: str):
+    """Synthesize ``text`` with edge-tts using an explicit rate and volume.
+
+    Args:
+        speaker_name: Voice passed to ``edge_tts.Communicate``.
+        text: Text to synthesize.
+        rate: Speaking-rate setting forwarded to ``edge_tts.Communicate``.
+        volume: Volume setting forwarded to ``edge_tts.Communicate``.
+
+    Returns:
+        The value returned by ``librosa.load`` for the converted WAV data.
+    """
+    tts_com = edge_tts.Communicate(text=text, voice=speaker_name, rate=rate, volume=volume)
+    tts_raw = await _collect_tts_audio(tts_com)
+    return await _mp3_bytes_to_audio(tts_raw)