mantrakp committed
Commit 7f167fb
1 Parent(s): dc868ec

Add sox and libsox-dev to requirements.txt

.gitmodules ADDED
@@ -0,0 +1,3 @@
+[submodule "tabs/audios/modules/CosyVoice"]
+	path = tabs/audios/modules/CosyVoice
+	url = https://github.com/FunAudioLLM/CosyVoice.git
config.py CHANGED
@@ -4,8 +4,10 @@ import json
 import torch
 
 
-# Setup Directories
-os.makedirs('.cache', exist_ok=True)
+# Setup Repo
+
+# Audios
+os.environ['PYTHONPATH'] = f'{os.path.dirname(__file__)}/modules/CosyVoice/third_party/Matcha-TTS:{os.environ.get("PYTHONPATH", "")}' # add tabs/audios/modules/CosyVoice/third_party/Matcha-TTS to PYTHONPATH
 
 
 css = """
@@ -32,9 +34,10 @@ body {
 class Config:
     # General
     SECRET_KEY = os.environ.get('SECRET_KEY', '12345678')
+    MODEL_DOWNLOAD_DIR = os.environ.get('HF_HOME', os.environ.get('HF_HUB_CACHE', '/.cache'))
+    os.makedirs(MODEL_DOWNLOAD_DIR, exist_ok=True)
 
     # Images
-    # IMAGE_MODELS = ["black-forest-labs/FLUX.1-dev", "stabilityai/stable-diffusion-xl-base-1.0"]
     IMAGES_MODELS = [{"repo_id": "black-forest-labs/FLUX.1-dev", "loader": "flux", "compute_type": torch.bfloat16,}, {"repo_id": "stabilityai/stable-diffusion-xl-base-1.0", "loader": "sdxl", "compute_type": torch.float16,}]
     with open('data/loras/sdxl.json') as f:
         IMAGES_LORAS_SDXL = json.load(f)
@@ -80,4 +83,4 @@ class Config:
 
 
     # Audios
-    AUDIOS_MODELS = [{"repo_id": "fal/AuraSR-v2"}]
+    AUDIOS_MODELS = []
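
Note that assigning to os.environ['PYTHONPATH'] at import time only affects Python interpreters launched afterwards, not the process that is already running. If the vendored Matcha-TTS package also needs to be importable in the current process, a minimal sketch (assuming the file sits next to the modules/ directory, as tabs/audios/load_models.py does; the helper name is hypothetical):

import os
import sys

def add_matcha_to_sys_path():
    # Hypothetical helper: make the vendored Matcha-TTS importable in the
    # *current* interpreter; PYTHONPATH from os.environ is only read at startup.
    matcha_path = os.path.join(
        os.path.dirname(__file__), 'modules', 'CosyVoice', 'third_party', 'Matcha-TTS'
    )
    if matcha_path not in sys.path:
        sys.path.insert(0, matcha_path)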
packages.txt CHANGED
@@ -1,2 +1,4 @@
 ffmpeg
 libgl1-mesa-glx
+sox
+libsox-dev
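
sox and libsox-dev are the system-level pieces CosyVoice's audio stack expects; whether torchaudio actually exposes a sox backend depends on the build. A small, hedged check after installing the packages:

import torchaudio

# Lists the audio backends this torchaudio build can use (e.g. 'soundfile',
# 'sox', 'ffmpeg'); the exact set depends on the installed version and wheels.
print(torchaudio.list_audio_backends())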
requirements.txt CHANGED
@@ -1,4 +1,3 @@
---extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
 spaces
 gradio
 torch
@@ -15,7 +14,6 @@ mediapipe
 controlnet_aux
 insightface
 omegaconf
-git+https://github.com/TencentARC/PhotoMaker.git
 torchao
 git+https://github.com/xhinker/sd_embed.git
 clip_interrogator
@@ -24,4 +22,28 @@ git+https://github.com/TencentARC/GFPGAN.git
 git+https://github.com/xinntao/Real-ESRGAN.git
 aura_sr
 deepfilternet
-styletts2
+conformer
+deepspeed
+gdown
+grpcio
+grpcio-tools
+hydra-core
+HyperPyYAML
+inflect
+librosa
+lightning
+matplotlib
+modelscope
+networkx
+onnx
+openai-whisper
+protobuf
+pydantic
+rich
+soundfile
+tensorboard
+WeTextProcessing
+wget
+fastapi-cli
+spacy
+spacy_langdetect
tabs/audios/events.py CHANGED
@@ -1,3 +1,4 @@
+import re
 import os
 import gc
 import tempfile
@@ -5,11 +6,13 @@ from uuid import uuid4
 
 import spaces
 import gradio as gr
+import torchaudio
 import numpy as np
 from df.enhance import enhance, load_audio, save_audio
 
 from config import Config
 from .load_models import *
+from .modules.CosyVoice.cosyvoice.utils.file_utils import load_wav
 
 
 # Helper functions
@@ -17,6 +20,103 @@ def create_temp_file():
     return tempfile.NamedTemporaryFile(delete=False)
 
 
+
+def assign_language_tags(text):
+    # Process the text
+    # based on the language assign <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
+    # at the start of the text for that language
+    # e.g. input: 你好 Hello こんにちは 你好 안녕하세요
+    # output: <|zh|>你好<|en|>Hello<|jp|>こんにちは<|yue|>你好<|ko|>안녕하세요
+    # Define language patterns
+    patterns = {
+        'zh': r'[\u4e00-\u9fff]+',  # Chinese characters
+        'en': r'[a-zA-Z]+',  # English letters
+        'jp': r'[\u3040-\u30ff\u31f0-\u31ff]+',  # Japanese characters
+        'ko': r'[\uac00-\ud7a3]+',  # Korean characters
+    }
+
+    # Find all matches
+    matches = []
+    for lang, pattern in patterns.items():
+        for match in re.finditer(pattern, text):
+            matches.append((match.start(), match.end(), lang, match.group()))
+
+    # Sort matches by start position
+    matches.sort(key=lambda x: x[0])
+
+    # Build the result string
+    result = []
+    last_end = 0
+    zh_count = 0
+    for start, end, lang, content in matches:
+        if start > last_end:
+            result.append(text[last_end:start])
+        if lang == 'zh':
+            zh_count += 1
+            if zh_count > 1:
+                lang = 'yue'
+        result.append(f'<|{lang}|>{content}')
+        last_end = end
+
+    if last_end < len(text):
+        result.append(text[last_end:])
+
+    return ''.join(result)
+
+
+def update_mode(mode, sft_speaker, speaker_audio, voice_instructions):
+    if mode == 'SFT':
+        return (
+            gr.update( # sft_speaker
+                visible=True,
+            ),
+            gr.update( # speaker_audio
+                visible=False,
+            ),
+            gr.update( # voice_instructions
+                visible=False,
+            ),
+        )
+    elif mode == 'VC':
+        return (
+            gr.update( # sft_speaker
+                visible=False,
+            ),
+            gr.update( # speaker_audio
+                visible=True,
+            ),
+            gr.update( # voice_instructions
+                visible=True,
+            ),
+        )
+    elif mode == 'VC-CrossLingual':
+        return (
+            gr.update( # sft_speaker
+                visible=False,
+            ),
+            gr.update( # speaker_audio
+                visible=True,
+            ),
+            gr.update( # voice_instructions
+                visible=False,
+            ),
+        )
+    elif mode == 'Instruct':
+        return (
+            gr.update( # sft_speaker
+                visible=True,
+            ),
+            gr.update( # speaker_audio
+                visible=False,
+            ),
+            gr.update( # voice_instructions
+                visible=True,
+            ),
+        )
+    else:
+        raise gr.Error('Invalid mode')
+
+
 @spaces.GPU(duration=10)
 def clear_audio(audio: np.ndarray):
     # Save the audio file
@@ -36,30 +136,76 @@ def clear_audio(audio: np.ndarray):
 
 
 @spaces.GPU(duration=20)
-def gen_audio(
-    text,
-    language,
-    speaker_audio: np.ndarray,
-    tts_alpha,
-    tts_beta,
-    tts_diffusion_steps,
-    tts_embedding_scale,
-):
-    # Save the speaker audio file
-    speaker_audio_file = create_temp_file()
-    np.save(speaker_audio_file.name, speaker_audio)
+def gen_audio(text, mode, sft_speaker = None, speaker_audio = None, voice_instructions = None):
+    if mode in ['VC', 'VC-CrossLingual']:
+        # Save the speaker audio file
+        speaker_audio_file = create_temp_file()
+        np.save(speaker_audio_file.name, speaker_audio)
+        prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
+    else:
+        speaker_audio_file = None
+        prompt_speech_16k = None
+
+    # Assign language tags
+    text = assign_language_tags(text)
 
     # Generate the audio
-    output = styletts2_model.inference(
-        text=text,
-        target_voice_path=speaker_audio_file.name,
-        output_wav_file=create_temp_file().name,
-        alpha=float(tts_alpha),
-        beta=float(tts_beta),
-        diffusion_steps=int(tts_diffusion_steps),
-        embedding_scale=int(tts_embedding_scale),
-    )
+    out_file = create_temp_file()
+    if mode == 'SFT':
+        if not sft_speaker:
+            raise gr.Error('Please select a speaker')
+
+        for i, j in enumerate(cv_base.inference_sft(
+            tts_text=text,
+            spk_id=sft_speaker,
+        )):
+            torchaudio.save(
+                out_file.name.format(i),
+                j['tts_speech'],
+                22050,
+            )
+    elif mode == 'VC':
+        if not speaker_audio_file:
+            raise gr.Error('Please upload an audio')
+
+        for i, j in enumerate(cv_sft.inference_zero_shot(
+            tts_text=text,
+            prompt_text=voice_instructions,
+            prompt_speech_16k=prompt_speech_16k,
+        )):
+            torchaudio.save(
+                out_file.name.format(i),
+                j['tts_speech'],
+                22050,
+            )
+    elif mode == 'VC-CrossLingual':
+        if not speaker_audio_file:
+            raise gr.Error('Please upload an audio')
+
+        for i, j in enumerate(cv_sft.inference_cross_lingual(
+            tts_text=text,
+            prompt_speech_16k=prompt_speech_16k,
+        )):
+            torchaudio.save(
+                out_file.name.format(i),
+                j['tts_speech'],
+                22050,
+            )
+    elif mode == 'Instruct':
+        if not voice_instructions:
+            raise gr.Error('Please enter voice instructions')
+
+        for i, j in enumerate(cv_instruct.inference_instruct(
+            tts_text=text,
+            spk_id=sft_speaker,
+            instruct_text=voice_instructions,
+        )):
+            torchaudio.save(
+                out_file.name.format(i),
+                j['tts_speech'],
+                22050,
+            )
 
     return gr.update( # output_audio
-        value=output,
+        value=out_file.name,
    )
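
One detail worth flagging in gen_audio: create_temp_file() returns a NamedTemporaryFile whose name contains no {} placeholder, so out_file.name.format(i) resolves to the same path on every iteration and each streamed chunk overwrites the previous one. A sketch of one way to keep all chunks, assuming each j['tts_speech'] is a (1, samples) tensor at 22050 Hz (save_chunks is a hypothetical helper, not part of the commit):

import torch
import torchaudio

def save_chunks(chunks, out_path, sample_rate=22050):
    # Concatenate streamed CosyVoice chunks along the sample axis and write a
    # single wav file, instead of overwriting one path per chunk.
    speech = torch.cat([j['tts_speech'] for j in chunks], dim=1)
    torchaudio.save(out_path, speech, sample_rate)
    return out_path

# e.g. save_chunks(cv_base.inference_sft(tts_text=text, spk_id=sft_speaker), out_file.name)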
tabs/audios/load_models.py CHANGED
@@ -1,17 +1,38 @@
+import os
+
 import torch
 from df.enhance import init_df
-from styletts2 import tts
+from modelscope import snapshot_download
 
 from config import Config
+from .modules.CosyVoice.cosyvoice.cli.cosyvoice import CosyVoice
 
 
 def init_sys():
     device = "cuda" if torch.cuda.is_available() else "cpu"
 
+    # Load DeepFilterNet2
     df_model, df_state, _ = init_df()
 
-    styletts2_model = tts.StyleTTS2()
+    # Download CosyVoice models
+    snapshot_download('iic/CosyVoice-300M', local_dir=f'{Config.MODEL_DOWNLOAD_DIR}/audios/CosyVoice-300M')
+    snapshot_download('iic/CosyVoice-300M-SFT', local_dir=f'{Config.MODEL_DOWNLOAD_DIR}/audios/CosyVoice-300M-SFT')
+    snapshot_download('iic/CosyVoice-300M-Instruct', local_dir=f'{Config.MODEL_DOWNLOAD_DIR}/audios/CosyVoice-300M-Instruct')
+    snapshot_download('iic/CosyVoice-ttsfrd', local_dir=f'{Config.MODEL_DOWNLOAD_DIR}/audios/CosyVoice-ttsfrd')
+
+    # Add `tabs/audios/modules/CosyVoice/third_party/Matcha-TTS` to your `PYTHONPATH`
+    os.environ['PYTHONPATH'] = f'{os.path.dirname(__file__)}/modules/CosyVoice/third_party/Matcha-TTS:{os.environ.get("PYTHONPATH", "")}'
+
+    # Load CosyVoice TTS
+    cv_base = CosyVoice('pretrained_models/CosyVoice-300M')
+
+    # Load CosyVoice SFT
+    cv_sft = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
+    sft_speakers = cv_sft.list_avaliable_spks()
+
+    # Load CosyVoice Instruct
+    cv_instruct = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
 
-    return device, df_model, df_state, styletts2_model
+    return device, df_model, df_state, cv_base, cv_sft, sft_speakers, cv_instruct
 
-device, df_model, df_state, styletts2_model = init_sys()
+device, df_model, df_state, cv_base, cv_sft, sft_speakers, cv_instruct = init_sys()
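
The snapshots are downloaded into {Config.MODEL_DOWNLOAD_DIR}/audios/... but the CosyVoice constructors load from pretrained_models/..., so the two only line up if those paths happen to coincide. A minimal sketch that derives both from the same string, assuming the intent is to load what was just downloaded (fetch_cosyvoice is a hypothetical helper, not part of the commit):

import os
from modelscope import snapshot_download
from config import Config

def fetch_cosyvoice(repo_id: str) -> str:
    # Download a CosyVoice checkpoint into MODEL_DOWNLOAD_DIR and return the
    # local path, so the same string can be passed straight to CosyVoice(...).
    local_dir = os.path.join(Config.MODEL_DOWNLOAD_DIR, 'audios', repo_id.split('/')[-1])
    snapshot_download(repo_id, local_dir=local_dir)
    return local_dir

# e.g. cv_base = CosyVoice(fetch_cosyvoice('iic/CosyVoice-300M'))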
tabs/audios/modules/CosyVoice ADDED
@@ -0,0 +1 @@
+Subproject commit c901a12789e0a9d8cec54c3caf1bc304533bdf82
tabs/audios/ui.py CHANGED
@@ -10,30 +10,19 @@ def audio_tab():
         with gr.Group():
             with gr.Group():
                 text = gr.Textbox(lines=5, label="Enter text")
-                language = gr.Dropdown(
-                    label="Language",
-                    choices=["en"],
-                    value="en",
-                )
-
-            with gr.Accordion('Voice Clone', open=True):
-                speaker_audio = gr.Audio(label="Upload Audio", type='numpy')
+                mode = gr.Radio(["SFT", "VC", "VC-CrossLingual", "Instruct"], label="Mode", value="SFT",) # automate with speech recognition pipeline
+                sft_speaker = gr.Radio(sft_speakers, label="Select speaker")
+            with gr.Accordion('Voice Clone', open=False):
+                speaker_audio = gr.Audio(label="Upload Audio", type='numpy', visible=False)
                 clear_speaker_audio = gr.Button(label="Clear Audio")
+
+            with gr.Accordion('Instruct', open=False):
+                voice_instructions = gr.Textbox(lines=5, label="Enter voice instructions", visible=False)
 
         with gr.Column():
             output_audio = gr.Audio(label="Output Audio", interactive=False, show_download_button=True)
             clear_output_audio = gr.Button(label="Clear Audio")
             generate_audio = gr.Button(label="Generate Audio")
-
-            with gr.Accordion('Advance Settings', open=True):
-                settings = [
-                    ('Alpha', 'tts_alpha', 'float', 0.0, 1.0, 0.3, 0.1,),
-                    ('Beta', 'tts_beta', 'float', 0.0, 1.0, 0.7, 0.1,),
-                    ('Diffusion Steps', 'tts_diffusion_steps', 'int', 1, 100, 10, 1,),
-                    ('Embedding Scale', 'tts_embedding_scale', 'int', 0, 10, 1, 1,),
-                ]
-                for label, key, type_, min_, max_, value, step in settings:
-                    globals()[key] = gr.Slider(label=label, minimum=min_, maximum=max_, value=value, step=step)
 
 
     # Events
@@ -41,9 +30,16 @@ def audio_tab():
     clear_speaker_audio.click(clear_audio, speaker_audio, speaker_audio)
     clear_output_audio.click(clear_audio, output_audio, output_audio)
 
+    # Mode
+    mode.change(
+        update_mode,
+        [mode, sft_speaker, speaker_audio, voice_instructions],
+        [sft_speaker, speaker_audio, voice_instructions]
+    )
+
     # Generate Audio
    generate_audio.click(
         gen_audio,
-        [text, language, speaker_audio, tts_alpha, tts_beta, tts_diffusion_steps, tts_embedding_scale], # type: ignore
+        [text, mode, sft_speaker, speaker_audio, voice_instructions],
         [output_audio]
     )
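
The per-mode visibility switching handled by update_mode could also be expressed as a lookup table; a compact sketch of the same behaviour (same output order: sft_speaker, speaker_audio, voice_instructions), offered as an alternative shape rather than what the commit implements:

import gradio as gr

# Visibility flags per mode for (sft_speaker, speaker_audio, voice_instructions).
MODE_VISIBILITY = {
    'SFT': (True, False, False),
    'VC': (False, True, True),
    'VC-CrossLingual': (False, True, False),
    'Instruct': (True, False, True),
}

def update_mode(mode, sft_speaker, speaker_audio, voice_instructions):
    if mode not in MODE_VISIBILITY:
        raise gr.Error('Invalid mode')
    return tuple(gr.update(visible=v) for v in MODE_VISIBILITY[mode])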