theNeofr committed on
Commit
1036aef
·
verified ·
1 Parent(s): eea7afd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1045 -338
app.py CHANGED
@@ -1,9 +1,9 @@
1
  import os
2
  import torch
3
- import shutil
4
  import logging
 
 
5
  import gradio as gr
6
-
7
  from audio_separator.separator import Separator
8
 
9
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -12,80 +12,80 @@ use_autocast = device == "cuda"
12
  #=========================#
13
  # Roformer Models #
14
  #=========================#
15
- ROFORMER_MODELS = {
16
- 'BS-Roformer-De-Reverb': 'deverb_bs_roformer_8_384dim_10depth.ckpt',
17
- 'BS-Roformer-Viperx-1053': 'model_bs_roformer_ep_937_sdr_10.5309.ckpt',
18
- 'BS-Roformer-Viperx-1296': 'model_bs_roformer_ep_368_sdr_12.9628.ckpt',
19
  'BS-Roformer-Viperx-1297': 'model_bs_roformer_ep_317_sdr_12.9755.ckpt',
 
 
 
 
20
  'Mel-Roformer-Crowd-Aufr33-Viperx': 'mel_band_roformer_crowd_aufr33_viperx_sdr_8.7144.ckpt',
21
  'Mel-Roformer-Denoise-Aufr33': 'denoise_mel_band_roformer_aufr33_sdr_27.9959.ckpt',
22
- 'Mel-Roformer-Denoise-Aufr33-Aggr': 'denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768.ckpt',
23
  'Mel-Roformer-Karaoke-Aufr33-Viperx': 'mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt',
24
- 'Mel-Roformer-Viperx-1143': 'model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt',
25
- 'MelBand Roformer Kim | Inst V1 by Unwa': 'melband_roformer_inst_v1.ckpt',
26
- 'MelBand Roformer Kim | Inst V2 by Unwa': 'melband_roformer_inst_v2.ckpt',
27
- 'MelBand Roformer Kim | InstVoc Duality V1 by Unwa': 'melband_roformer_instvoc_duality_v1.ckpt',
28
- 'MelBand Roformer Kim | InstVoc Duality V2 by Unwa': 'melband_roformer_instvox_duality_v2.ckpt',
29
- 'Vocals Mel Band Roformer': 'vocals_mel_band_roformer.ckpt',
30
- 'Mel Band Roformer Bleed Suppressor V1': 'mel_band_roformer_bleed_suppressor_v1.ckpt',
31
- 'Mel Band Roformer SYHFT V2': 'MelBandRoformerSYHFTV2.ckpt',
32
- 'Mel Band Roformer SYHFT V2.5': 'MelBandRoformerSYHFTV2.5.ckpt',
33
  }
 
34
  #=========================#
35
  # MDX23C Models #
36
  #=========================#
37
- MDX23C_MODELS = [
 
38
  'MDX23C-8KFFT-InstVoc_HQ.ckpt',
39
  'MDX23C-8KFFT-InstVoc_HQ_2.ckpt',
40
- 'MDX23C_D1581.ckpt',
41
  ]
 
42
  #=========================#
43
  # MDX-NET Models #
44
  #=========================#
45
- MDXNET_MODELS = [
46
- 'UVR-MDX-NET-Crowd_HQ_1.onnx',
47
- 'UVR-MDX-NET-Inst_1.onnx',
48
- 'UVR-MDX-NET-Inst_2.onnx',
49
- 'UVR-MDX-NET-Inst_3.onnx',
50
- 'UVR-MDX-NET-Inst_HQ_1.onnx',
51
- 'UVR-MDX-NET-Inst_HQ_2.onnx',
52
- 'UVR-MDX-NET-Inst_HQ_3.onnx',
53
- 'UVR-MDX-NET-Inst_HQ_4.onnx',
54
- 'UVR-MDX-NET-Inst_HQ_5.onnx',
55
  'UVR-MDX-NET-Inst_full_292.onnx',
56
- 'UVR-MDX-NET-Voc_FT.onnx',
57
  'UVR-MDX-NET_Inst_82_beta.onnx',
58
  'UVR-MDX-NET_Inst_90_beta.onnx',
59
- 'UVR-MDX-NET_Inst_187_beta.onnx',
60
  'UVR-MDX-NET_Main_340.onnx',
61
  'UVR-MDX-NET_Main_390.onnx',
62
  'UVR-MDX-NET_Main_406.onnx',
63
  'UVR-MDX-NET_Main_427.onnx',
64
  'UVR-MDX-NET_Main_438.onnx',
 
 
 
 
 
 
 
65
  'UVR_MDXNET_1_9703.onnx',
66
  'UVR_MDXNET_2_9682.onnx',
67
  'UVR_MDXNET_3_9662.onnx',
68
- 'UVR_MDXNET_9482.onnx',
 
 
69
  'UVR_MDXNET_KARA.onnx',
70
  'UVR_MDXNET_KARA_2.onnx',
71
- 'UVR_MDXNET_Main.onnx',
 
 
 
 
 
 
 
 
72
  'kuielab_a_bass.onnx',
73
  'kuielab_a_drums.onnx',
74
- 'kuielab_a_other.onnx',
75
- 'kuielab_a_vocals.onnx',
76
  'kuielab_b_bass.onnx',
77
  'kuielab_b_drums.onnx',
78
- 'kuielab_b_other.onnx',
79
- 'kuielab_b_vocals.onnx',
80
- 'Kim_Inst.onnx',
81
- 'Kim_Vocal_1.onnx',
82
- 'Kim_Vocal_2.onnx',
83
- 'Reverb_HQ_By_FoxJoy.onnx',
84
  ]
 
85
  #========================#
86
  # VR-ARCH Models #
87
  #========================#
88
- VR_ARCH_MODELS = [
89
  '1_HP-UVR.pth',
90
  '2_HP-UVR.pth',
91
  '3_HP-Vocal-UVR.pth',
@@ -103,168 +103,186 @@ VR_ARCH_MODELS = [
103
  '15_SP-UVR-MID-44100-1.pth',
104
  '16_SP-UVR-MID-44100-2.pth',
105
  '17_HP-Wind_Inst-UVR.pth',
106
- 'MGM_HIGHEND_v4.pth',
107
- 'MGM_LOWEND_A_v4.pth',
108
- 'MGM_LOWEND_B_v4.pth',
109
- 'MGM_MAIN_v4.pth',
110
- 'UVR-BVE-4B_SN-44100-1.pth',
111
- 'UVR-DeEcho-DeReverb.pth',
112
  'UVR-De-Echo-Aggressive.pth',
113
  'UVR-De-Echo-Normal.pth',
 
114
  'UVR-DeNoise-Lite.pth',
115
  'UVR-DeNoise.pth',
 
 
 
 
 
116
  ]
 
117
  #=======================#
118
  # DEMUCS Models #
119
  #=======================#
120
- DEMUCS_MODELS = [
121
- 'hdemucs_mmi.yaml',
122
- 'htdemucs.yaml',
123
- 'htdemucs_6s.yaml',
124
  'htdemucs_ft.yaml',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  ]
126
 
127
def print_message(input_file, model_name):
    """Print a short banner announcing which file is separated with which model."""
    file_name = os.path.basename(input_file)
    stem, _extension = os.path.splitext(file_name)
    # Each tuple holds the positional arguments of one print() call.
    banner_rows = (
        ("\n",),
        ("🎵 Audio-Separator 🎵",),
        ("Input audio:", stem),
        ("Separation Model:", model_name),
        ("Audio Separation Process...",),
    )
    for row in banner_rows:
        print(*row)
135
-
136
def prepare_output_dir(input_file, output_dir):
    """Create an empty per-input output directory and return its path.

    The directory is ``output_dir/<input file name without extension>``. If it
    already exists it is removed first, so results of an earlier run are not
    mixed with the new ones.

    Args:
        input_file: Path of the audio file about to be processed.
        output_dir: Parent directory under which the per-input folder lives.

    Returns:
        Path of the freshly created directory.

    Raises:
        RuntimeError: If the directory cannot be removed or created. The
            underlying error is attached as ``__cause__``.
    """
    base_name = os.path.splitext(os.path.basename(input_file))[0]
    out_dir = os.path.join(output_dir, base_name)
    try:
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        os.makedirs(out_dir)
    except Exception as e:
        # Chain explicitly, like the separator functions do, so the real
        # filesystem error stays visible in the traceback.
        raise RuntimeError(f"Failed to prepare output directory {out_dir}: {e}") from e
    return out_dir
147
 
148
def roformer_separator(audio, model_key, seg_size, override_seg_size, overlap, pitch_shift, model_dir, out_dir, out_format, norm_thresh, amp_thresh, batch_size, progress=gr.Progress()):
    """Separate audio using a Roformer model.

    Args:
        audio: Path of the input audio file.
        model_key: Key into ``ROFORMER_MODELS`` selecting the checkpoint file.
        seg_size, override_seg_size, overlap, pitch_shift, batch_size:
            Forwarded to ``Separator`` through ``mdxc_params``.
        model_dir: Passed to ``Separator`` as ``model_file_dir``.
        out_dir: Parent output directory; a per-input subdirectory is prepared
            with ``prepare_output_dir`` and used for the results.
        out_format, norm_thresh, amp_thresh: Forwarded to ``Separator``.
        progress: Gradio progress tracker updated before each long step.

    Returns:
        Tuple of two file paths: the second and then the first entry reported
        by ``separator.separate``.

    Raises:
        RuntimeError: If preparing the directory, loading the model or
            separating fails.
    """
    base_name = os.path.splitext(os.path.basename(audio))[0]
    print_message(audio, model_key)
    model = ROFORMER_MODELS[model_key]
    try:
        out_dir = prepare_output_dir(audio, out_dir)
        separator = Separator(
            log_level=logging.WARNING,
            model_file_dir=model_dir,
            output_dir=out_dir,
            output_format=out_format,
            normalization_threshold=norm_thresh,
            amplification_threshold=amp_thresh,
            use_autocast=use_autocast,
            mdxc_params={
                "segment_size": seg_size,
                "override_model_segment_size": override_seg_size,
                "batch_size": batch_size,
                "overlap": overlap,
                "pitch_shift": pitch_shift,
            },
        )

        # The label is shown while the step below runs, so it describes the
        # step in progress rather than claiming it already finished.
        progress(0.2, desc="Loading model...")
        separator.load_model(model_filename=model)

        progress(0.7, desc="Separating audio...")
        separation = separator.separate(audio, f"{base_name}_(Stem1)", f"{base_name}_(Stem2)")
        print(f"Separation complete!\nResults: {', '.join(separation)}")

        stems = [os.path.join(out_dir, file_name) for file_name in separation]
        return stems[1], stems[0]
    except Exception as e:
        raise RuntimeError(f"Roformer separation failed: {e}") from e
183
 
184
def mdx23c_separator(audio, model, seg_size, override_seg_size, overlap, pitch_shift, model_dir, out_dir, out_format, norm_thresh, amp_thresh, batch_size, progress=gr.Progress(track_tqdm=True)):
    """Separate audio using an MDX23C model.

    Args:
        audio: Path of the input audio file.
        model: Model file name handed to ``separator.load_model``.
        seg_size, override_seg_size, overlap, pitch_shift, batch_size:
            Forwarded to ``Separator`` through ``mdxc_params``.
        model_dir: Passed to ``Separator`` as ``model_file_dir``.
        out_dir: Parent output directory; a per-input subdirectory is prepared
            with ``prepare_output_dir`` and used for the results.
        out_format, norm_thresh, amp_thresh: Forwarded to ``Separator``.
        progress: Gradio progress tracker updated before each long step.

    Returns:
        Tuple of two file paths: the second and then the first entry reported
        by ``separator.separate``.

    Raises:
        RuntimeError: If preparing the directory, loading the model or
            separating fails.
    """
    base_name = os.path.splitext(os.path.basename(audio))[0]
    print_message(audio, model)
    try:
        out_dir = prepare_output_dir(audio, out_dir)
        separator = Separator(
            log_level=logging.WARNING,
            model_file_dir=model_dir,
            output_dir=out_dir,
            output_format=out_format,
            normalization_threshold=norm_thresh,
            amplification_threshold=amp_thresh,
            use_autocast=use_autocast,
            mdxc_params={
                "segment_size": seg_size,
                "override_model_segment_size": override_seg_size,
                "batch_size": batch_size,
                "overlap": overlap,
                "pitch_shift": pitch_shift,
            },
        )

        # Labels name the step that is about to run, not one already done.
        progress(0.2, desc="Loading model...")
        separator.load_model(model_filename=model)

        progress(0.7, desc="Separating audio...")
        separation = separator.separate(audio, f"{base_name}_(Stem1)", f"{base_name}_(Stem2)")
        print(f"Separation complete!\nResults: {', '.join(separation)}")

        stems = [os.path.join(out_dir, file_name) for file_name in separation]
        return stems[1], stems[0]
    except Exception as e:
        raise RuntimeError(f"MDX23C separation failed: {e}") from e
218
 
219
def mdx_separator(audio, model, hop_length, seg_size, overlap, denoise, model_dir, out_dir, out_format, norm_thresh, amp_thresh, batch_size, progress=gr.Progress()):
    """Separate audio using an MDX-NET model.

    Args:
        audio: Path of the input audio file.
        model: Model file name handed to ``separator.load_model``.
        hop_length, seg_size, overlap, batch_size: Forwarded to ``Separator``
            through ``mdx_params``.
        denoise: Forwarded as ``mdx_params["enable_denoise"]``.
        model_dir: Passed to ``Separator`` as ``model_file_dir``.
        out_dir: Parent output directory; a per-input subdirectory is prepared
            with ``prepare_output_dir`` and used for the results.
        out_format, norm_thresh, amp_thresh: Forwarded to ``Separator``.
        progress: Gradio progress tracker updated before each long step.

    Returns:
        Tuple of two file paths: the first and then the second entry reported
        by ``separator.separate``.

    Raises:
        RuntimeError: If preparing the directory, loading the model or
            separating fails.
    """
    base_name = os.path.splitext(os.path.basename(audio))[0]
    print_message(audio, model)
    try:
        out_dir = prepare_output_dir(audio, out_dir)
        separator = Separator(
            log_level=logging.WARNING,
            model_file_dir=model_dir,
            output_dir=out_dir,
            output_format=out_format,
            normalization_threshold=norm_thresh,
            amplification_threshold=amp_thresh,
            use_autocast=use_autocast,
            mdx_params={
                "hop_length": hop_length,
                "segment_size": seg_size,
                "overlap": overlap,
                "batch_size": batch_size,
                "enable_denoise": denoise,
            },
        )

        # Labels name the step that is about to run, not one already done.
        progress(0.2, desc="Loading model...")
        separator.load_model(model_filename=model)

        progress(0.7, desc="Separating audio...")
        separation = separator.separate(audio, f"{base_name}_(Stem1)", f"{base_name}_(Stem2)")
        print(f"Separation complete!\nResults: {', '.join(separation)}")

        stems = [os.path.join(out_dir, file_name) for file_name in separation]
        return stems[0], stems[1]
    except Exception as e:
        raise RuntimeError(f"MDX-NET separation failed: {e}") from e
253
 
254
- def vr_separator(audio, model, window_size, aggression, tta, post_process, post_process_threshold, high_end_process, model_dir, out_dir, out_format, norm_thresh, amp_thresh, batch_size, progress=gr.Progress()):
255
- """Separate audio using VR ARCH model."""
256
  base_name = os.path.splitext(os.path.basename(audio))[0]
257
- print_message(audio, model)
258
  try:
259
- out_dir = prepare_output_dir(audio, out_dir)
260
  separator = Separator(
261
  log_level=logging.WARNING,
262
- model_file_dir=model_dir,
263
  output_dir=out_dir,
264
  output_format=out_format,
 
265
  normalization_threshold=norm_thresh,
266
  amplification_threshold=amp_thresh,
267
- use_autocast=use_autocast,
268
  vr_params={
269
  "batch_size": batch_size,
270
  "window_size": window_size,
@@ -276,45 +294,43 @@ def vr_separator(audio, model, window_size, aggression, tta, post_process, post_
276
  }
277
  )
278
 
279
- progress(0.2, desc="Model loaded...")
280
  separator.load_model(model_filename=model)
281
 
282
- progress(0.7, desc="Audio separated...")
283
  separation = separator.separate(audio, f"{base_name}_(Stem1)", f"{base_name}_(Stem2)")
284
- print(f"Separation complete!\nResults: {', '.join(separation)}")
285
 
286
  stems = [os.path.join(out_dir, file_name) for file_name in separation]
287
  return stems[0], stems[1]
288
  except Exception as e:
289
  raise RuntimeError(f"VR ARCH separation failed: {e}") from e
290
 
291
- def demucs_separator(audio, model, seg_size, shifts, overlap, segments_enabled, model_dir, out_dir, out_format, norm_thresh, amp_thresh, progress=gr.Progress()):
292
- """Separate audio using Demucs model."""
293
- print_message(audio, model)
294
  try:
295
- out_dir = prepare_output_dir(audio, out_dir)
296
  separator = Separator(
297
  log_level=logging.WARNING,
298
- model_file_dir=model_dir,
299
  output_dir=out_dir,
300
  output_format=out_format,
 
301
  normalization_threshold=norm_thresh,
302
  amplification_threshold=amp_thresh,
303
- use_autocast=use_autocast,
304
  demucs_params={
305
- "segment_size": seg_size,
 
306
  "shifts": shifts,
307
  "overlap": overlap,
308
  "segments_enabled": segments_enabled,
309
  }
310
  )
311
 
312
- progress(0.2, desc="Model loaded...")
313
  separator.load_model(model_filename=model)
314
 
315
- progress(0.7, desc="Audio separated...")
316
  separation = separator.separate(audio)
317
- print(f"Separation complete!\nResults: {', '.join(separation)}")
318
 
319
  stems = [os.path.join(out_dir, file_name) for file_name in separation]
320
 
@@ -331,231 +347,922 @@ def update_stems(model):
331
  else:
332
  return gr.update(visible=False)
333
 
 
 
 
 
 
334
 
 
 
 
 
335
 
 
 
 
 
 
 
336
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
 
 
 
 
338
 
 
 
 
 
 
 
 
339
 
 
 
 
 
340
 
341
- with gr.Blocks(title="🎵 Audio-Separator 🎵",theme=gr.themes.Base()) as app:
342
- gr.HTML("<h1> 🎵 Audio-Separator 🎵 </h1>")
343
-
344
- with gr.Tab("Roformer"):
345
- with gr.Group():
346
- with gr.Row():
347
- roformer_model = gr.Dropdown(label="Select the Model", choices=list(ROFORMER_MODELS.keys()))
348
- with gr.Row():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
  with gr.Row():
350
- roformer_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
351
- roformer_override_seg_size = gr.Checkbox(value=False, label="Override segment size", info="Override model default segment size instead of using the model default value.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  with gr.Row():
353
- roformer_overlap = gr.Slider(minimum=2, maximum=10, step=1, value=8, label="Overlap", info="Amount of overlap between prediction windows. Lower is better but slower.")
354
- roformer_pitch_shift = gr.Slider(minimum=-12, maximum=12, step=1, value=0, label="Pitch shift", info="Shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals.")
355
- with gr.Row():
356
- roformer_audio = gr.Audio(label="Input Audio", type="filepath")
357
- with gr.Row():
358
- roformer_button = gr.Button("Separate!", variant="primary")
359
- with gr.Row():
360
- roformer_stem1 = gr.Audio(label="Stem 1", type="filepath", interactive=False)
361
- roformer_stem2 = gr.Audio(label="Stem 2", type="filepath", interactive=False)
362
-
363
- with gr.Tab("MDX23C"):
364
- with gr.Group():
365
- with gr.Row():
366
- mdx23c_model = gr.Dropdown(label="Select the Model", choices=MDX23C_MODELS)
367
- with gr.Row():
368
- mdx23c_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
369
- mdx23c_override_seg_size = gr.Checkbox(value=False, label="Override segment size", info="Override model default segment size instead of using the model default value.")
370
- with gr.Row():
371
- mdx23c_overlap = gr.Slider(minimum=2, maximum=50, step=1, value=8, label="Overlap", info="Amount of overlap between prediction windows. Higher is better but slower.")
372
- mdx23c_pitch_shift = gr.Slider(minimum=-12, maximum=12, step=1, value=0, label="Pitch shift", info="Shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals.")
373
- with gr.Row():
374
- mdx23c_audio = gr.Audio(label="Input Audio", type="filepath")
375
- with gr.Row():
376
- mdx23c_button = gr.Button("Separate!", variant="primary")
377
- with gr.Row():
378
- mdx23c_stem1 = gr.Audio(label="Stem 1", type="filepath", interactive=False)
379
- mdx23c_stem2 = gr.Audio(label="Stem 2", type="filepath", interactive=False)
380
-
381
- with gr.Tab("MDX-NET"):
382
- with gr.Group():
383
- with gr.Row():
384
- mdx_model = gr.Dropdown(label="Select the Model", choices=MDXNET_MODELS)
385
- with gr.Row():
386
- mdx_hop_length = gr.Slider(minimum=32, maximum=2048, step=32, value=1024, label="Hop Length", info="Usually called stride in neural networks; only change if you know what you're doing.")
387
- mdx_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
388
- with gr.Row():
389
- mdx_overlap = gr.Slider(minimum=0.001, maximum=0.999, step=0.001, value=0.25, label="Overlap", info="Amount of overlap between prediction windows. Higher is better but slower.")
390
- mdx_denoise = gr.Checkbox(value=False, label="Denoise", info="Enable denoising after separation.")
391
- with gr.Row():
392
- mdx_audio = gr.Audio(label="Input Audio", type="filepath")
393
- with gr.Row():
394
- mdx_button = gr.Button("Separate!", variant="primary")
395
- with gr.Row():
396
- mdx_stem1 = gr.Audio(label="Stem 1", type="filepath", interactive=False)
397
- mdx_stem2 = gr.Audio(label="Stem 2", type="filepath", interactive=False)
398
-
399
- with gr.Tab("VR ARCH"):
400
- with gr.Group():
401
- with gr.Row():
402
- vr_model = gr.Dropdown(label="Select the Model", choices=VR_ARCH_MODELS)
403
- with gr.Row():
404
- vr_window_size = gr.Slider(minimum=320, maximum=1024, step=32, value=512, label="Window Size", info="Balance quality and speed. 1024 = fast but lower, 320 = slower but better quality.")
405
- vr_aggression = gr.Slider(minimum=1, maximum=50, step=1, value=5, label="Agression", info="Intensity of primary stem extraction.")
406
- with gr.Row():
407
- vr_tta = gr.Checkbox(value=False, label="TTA", info="Enable Test-Time-Augmentation; slow but improves quality.")
408
- with gr.Row():
409
- vr_post_process = gr.Checkbox(value=False, label="Post Process", info="Identify leftover artifacts within vocal output; may improve separation for some songs.")
410
- vr_post_process_threshold = gr.Slider(minimum=0.1, maximum=0.3, step=0.1, value=0.2, label="Post Process Threshold", info="Threshold for post-processing.")
411
- vr_high_end_process = gr.Checkbox(value=False, label="High End Process", info="Mirror the missing frequency range of the output.")
412
- with gr.Row():
413
- vr_audio = gr.Audio(label="Input Audio", type="filepath")
414
- with gr.Row():
415
- vr_button = gr.Button("Separate!", variant="primary")
416
- with gr.Row():
417
- vr_stem1 = gr.Audio(label="Stem 1", type="filepath", interactive=False)
418
- vr_stem2 = gr.Audio(label="Stem 2", type="filepath", interactive=False)
419
-
420
- with gr.Tab("Demucs"):
421
- with gr.Group():
422
- with gr.Row():
423
- demucs_model = gr.Dropdown(label="Select the Model", choices=DEMUCS_MODELS)
424
- with gr.Row():
425
- demucs_seg_size = gr.Slider(minimum=1, maximum=100, step=1, value=40, label="Segment Size", info="Size of segments into which the audio is split. Higher = slower but better quality.")
426
- demucs_shifts = gr.Slider(minimum=0, maximum=20, step=1, value=2, label="Shifts", info="Number of predictions with random shifts, higher = slower but better quality.")
427
- demucs_overlap = gr.Slider(minimum=0.001, maximum=0.999, step=0.001, value=0.25, label="Overlap", info="Overlap between prediction windows. Higher = slower but better quality.")
428
- demucs_segments_enabled = gr.Checkbox(value=True, label="Segment-wise processing", info="Enable segment-wise processing.")
429
- with gr.Row():
430
- demucs_audio = gr.Audio(label="Input Audio", type="filepath")
431
- with gr.Row():
432
- demucs_button = gr.Button("Separate!", variant="primary")
433
- with gr.Row():
434
- demucs_stem1 = gr.Audio(label="Stem 1", type="filepath", interactive=False)
435
- demucs_stem2 = gr.Audio(label="Stem 2", type="filepath", interactive=False)
436
- with gr.Row():
437
- demucs_stem3 = gr.Audio(label="Stem 3", type="filepath", interactive=False)
438
- demucs_stem4 = gr.Audio(label="Stem 4", type="filepath", interactive=False)
439
- with gr.Row(visible=False) as stem6:
440
- demucs_stem5 = gr.Audio(label="Stem 5", type="filepath", interactive=False)
441
- demucs_stem6 = gr.Audio(label="Stem 6", type="filepath", interactive=False)
442
-
443
-
444
- with gr.Tab("General settings"):
445
- with gr.Group():
446
- model_file_dir = gr.Textbox(value="/tmp/audio-separator-models/", label="Directory to cache model files", info="The directory where model files are stored.", placeholder="/tmp/audio-separator-models/")
447
- with gr.Row():
448
- output_dir = gr.Textbox(value="output", label="File output directory", info="The directory where output files will be saved.", placeholder="output")
449
- output_format = gr.Dropdown(value="wav", choices=["wav", "flac", "mp3"], label="Output Format", info="The format of the output audio file.")
450
- with gr.Row():
451
- norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
452
- amp_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.6, label="Amplification threshold", info="The threshold for audio amplification.")
453
- with gr.Row():
454
- batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
455
- with gr.Tab("Credits"):
456
- gr.Markdown("""
457
- Politrees - gradio webui\n
458
- theNeodev - mod the ui\n
459
- nomadkaraoke - original project
460
- """)
461
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
462
 
463
- demucs_model.change(update_stems, inputs=[demucs_model], outputs=stem6)
464
-
465
- roformer_button.click(
466
- roformer_separator,
467
- inputs=[
468
- roformer_audio,
469
- roformer_model,
470
- roformer_seg_size,
471
- roformer_override_seg_size,
472
- roformer_overlap,
473
- roformer_pitch_shift,
474
- model_file_dir,
475
- output_dir,
476
- output_format,
477
- norm_threshold,
478
- amp_threshold,
479
- batch_size,
480
- ],
481
- outputs=[roformer_stem1, roformer_stem2],
482
- )
483
- mdx23c_button.click(
484
- mdx23c_separator,
485
- inputs=[
486
- mdx23c_audio,
487
- mdx23c_model,
488
- mdx23c_seg_size,
489
- mdx23c_override_seg_size,
490
- mdx23c_overlap,
491
- mdx23c_pitch_shift,
492
- model_file_dir,
493
- output_dir,
494
- output_format,
495
- norm_threshold,
496
- amp_threshold,
497
- batch_size,
498
- ],
499
- outputs=[mdx23c_stem1, mdx23c_stem2],
500
- )
501
- mdx_button.click(
502
- mdx_separator,
503
- inputs=[
504
- mdx_audio,
505
- mdx_model,
506
- mdx_hop_length,
507
- mdx_seg_size,
508
- mdx_overlap,
509
- mdx_denoise,
510
- model_file_dir,
511
- output_dir,
512
- output_format,
513
- norm_threshold,
514
- amp_threshold,
515
- batch_size,
516
- ],
517
- outputs=[mdx_stem1, mdx_stem2],
518
- )
519
- vr_button.click(
520
- vr_separator,
521
- inputs=[
522
- vr_audio,
523
- vr_model,
524
- vr_window_size,
525
- vr_aggression,
526
- vr_tta,
527
- vr_post_process,
528
- vr_post_process_threshold,
529
- vr_high_end_process,
530
- model_file_dir,
531
- output_dir,
532
- output_format,
533
- norm_threshold,
534
- amp_threshold,
535
- batch_size,
536
- ],
537
- outputs=[vr_stem1, vr_stem2],
538
- )
539
- demucs_button.click(
540
- demucs_separator,
541
- inputs=[
542
- demucs_audio,
543
- demucs_model,
544
- demucs_seg_size,
545
- demucs_shifts,
546
- demucs_overlap,
547
- demucs_segments_enabled,
548
- model_file_dir,
549
- output_dir,
550
- output_format,
551
- norm_threshold,
552
- amp_threshold,
553
- ],
554
- outputs=[demucs_stem1, demucs_stem2, demucs_stem3, demucs_stem4, demucs_stem5, demucs_stem6],
555
- )
556
 
557
def main():
    """Launch the Gradio app with the share and debug options switched on."""
    launch_options = {"share": True, "debug": True}
    app.launch(**launch_options)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559
 
560
- if __name__ == "__main__":
561
- main()
 
1
  import os
2
  import torch
 
3
  import logging
4
+ import yt_dlp
5
+ import spaces
6
  import gradio as gr
 
7
  from audio_separator.separator import Separator
8
 
9
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
12
  #=========================#
13
  # Roformer Models #
14
  #=========================#
15
+ roformer_models = {
 
 
 
16
  'BS-Roformer-Viperx-1297': 'model_bs_roformer_ep_317_sdr_12.9755.ckpt',
17
+ 'BS-Roformer-Viperx-1296': 'model_bs_roformer_ep_368_sdr_12.9628.ckpt',
18
+ 'BS-Roformer-Viperx-1053': 'model_bs_roformer_ep_937_sdr_10.5309.ckpt',
19
+ 'Mel-Roformer-Viperx-1143': 'model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt',
20
+ 'BS-Roformer-De-Reverb': 'deverb_bs_roformer_8_384dim_10depth.ckpt',
21
  'Mel-Roformer-Crowd-Aufr33-Viperx': 'mel_band_roformer_crowd_aufr33_viperx_sdr_8.7144.ckpt',
22
  'Mel-Roformer-Denoise-Aufr33': 'denoise_mel_band_roformer_aufr33_sdr_27.9959.ckpt',
23
+ 'Mel-Roformer-Denoise-Aufr33-Aggr' : 'denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768.ckpt',
24
  'Mel-Roformer-Karaoke-Aufr33-Viperx': 'mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt',
25
+ 'MelBand Roformer Kim | Inst V1 by Unwa' : 'melband_roformer_inst_v1.ckpt',
26
+ 'MelBand Roformer Kim | Inst V2 by Unwa' : 'melband_roformer_inst_v2.ckpt',
27
+ 'MelBand Roformer Kim | InstVoc Duality V1 by Unwa' : 'melband_roformer_instvoc_duality_v1.ckpt',
28
+ 'MelBand Roformer Kim | InstVoc Duality V2 by Unwa' : 'melband_roformer_instvox_duality_v2.ckpt',
 
 
 
 
 
29
  }
30
+
31
  #=========================#
32
  # MDX23C Models #
33
  #=========================#
34
+ mdx23c_models = [
35
+ 'MDX23C_D1581.ckpt',
36
  'MDX23C-8KFFT-InstVoc_HQ.ckpt',
37
  'MDX23C-8KFFT-InstVoc_HQ_2.ckpt',
 
38
  ]
39
+
40
  #=========================#
41
  # MDX-NET Models #
42
  #=========================#
43
+ mdxnet_models = [
 
 
 
 
 
 
 
 
 
44
  'UVR-MDX-NET-Inst_full_292.onnx',
45
+ 'UVR-MDX-NET_Inst_187_beta.onnx',
46
  'UVR-MDX-NET_Inst_82_beta.onnx',
47
  'UVR-MDX-NET_Inst_90_beta.onnx',
 
48
  'UVR-MDX-NET_Main_340.onnx',
49
  'UVR-MDX-NET_Main_390.onnx',
50
  'UVR-MDX-NET_Main_406.onnx',
51
  'UVR-MDX-NET_Main_427.onnx',
52
  'UVR-MDX-NET_Main_438.onnx',
53
+ 'UVR-MDX-NET-Inst_HQ_1.onnx',
54
+ 'UVR-MDX-NET-Inst_HQ_2.onnx',
55
+ 'UVR-MDX-NET-Inst_HQ_3.onnx',
56
+ 'UVR-MDX-NET-Inst_HQ_4.onnx',
57
+ 'UVR-MDX-NET-Inst_HQ_5.onnx',
58
+ 'UVR_MDXNET_Main.onnx',
59
+ 'UVR-MDX-NET-Inst_Main.onnx',
60
  'UVR_MDXNET_1_9703.onnx',
61
  'UVR_MDXNET_2_9682.onnx',
62
  'UVR_MDXNET_3_9662.onnx',
63
+ 'UVR-MDX-NET-Inst_1.onnx',
64
+ 'UVR-MDX-NET-Inst_2.onnx',
65
+ 'UVR-MDX-NET-Inst_3.onnx',
66
  'UVR_MDXNET_KARA.onnx',
67
  'UVR_MDXNET_KARA_2.onnx',
68
+ 'UVR_MDXNET_9482.onnx',
69
+ 'UVR-MDX-NET-Voc_FT.onnx',
70
+ 'Kim_Vocal_1.onnx',
71
+ 'Kim_Vocal_2.onnx',
72
+ 'Kim_Inst.onnx',
73
+ 'Reverb_HQ_By_FoxJoy.onnx',
74
+ 'UVR-MDX-NET_Crowd_HQ_1.onnx',
75
+ 'kuielab_a_vocals.onnx',
76
+ 'kuielab_a_other.onnx',
77
  'kuielab_a_bass.onnx',
78
  'kuielab_a_drums.onnx',
79
+ 'kuielab_b_vocals.onnx',
80
+ 'kuielab_b_other.onnx',
81
  'kuielab_b_bass.onnx',
82
  'kuielab_b_drums.onnx',
 
 
 
 
 
 
83
  ]
84
+
85
  #========================#
86
  # VR-ARCH Models #
87
  #========================#
88
+ vrarch_models = [
89
  '1_HP-UVR.pth',
90
  '2_HP-UVR.pth',
91
  '3_HP-Vocal-UVR.pth',
 
103
  '15_SP-UVR-MID-44100-1.pth',
104
  '16_SP-UVR-MID-44100-2.pth',
105
  '17_HP-Wind_Inst-UVR.pth',
 
 
 
 
 
 
106
  'UVR-De-Echo-Aggressive.pth',
107
  'UVR-De-Echo-Normal.pth',
108
+ 'UVR-DeEcho-DeReverb.pth',
109
  'UVR-DeNoise-Lite.pth',
110
  'UVR-DeNoise.pth',
111
+ 'UVR-BVE-4B_SN-44100-1.pth',
112
+ 'MGM_HIGHEND_v4.pth',
113
+ 'MGM_LOWEND_A_v4.pth',
114
+ 'MGM_LOWEND_B_v4.pth',
115
+ 'MGM_MAIN_v4.pth',
116
  ]
117
+
118
  #=======================#
119
  # DEMUCS Models #
120
  #=======================#
121
+ demucs_models = [
 
 
 
122
  'htdemucs_ft.yaml',
123
+ 'htdemucs_6s.yaml',
124
+ 'htdemucs.yaml',
125
+ 'hdemucs_mmi.yaml',
126
+ ]
127
+
128
+ output_format = [
129
+ 'wav',
130
+ 'flac',
131
+ 'mp3',
132
+ 'ogg',
133
+ 'opus',
134
+ 'm4a',
135
+ 'aiff',
136
+ 'ac3'
137
  ]
138
 
139
+ found_files = []
140
+ logs = []
141
+ out_dir = "./outputs"
142
+ models_dir = "./models"
143
+ extensions = (".wav", ".flac", ".mp3", ".ogg", ".opus", ".m4a", ".aiff", ".ac3")
144
+
145
def download_audio(url, output_dir="ytdl"):
    """Download the audio of ``url`` with yt-dlp and return the path of the WAV file.

    Args:
        url: Address of the media page handed to yt-dlp.
        output_dir: Directory that receives the download; created if missing.

    Returns:
        Absolute path of the extracted ``.wav`` file.

    Raises:
        Exception: If yt-dlp fails or the expected WAV file is not on disk
            afterwards. The original error is attached as ``__cause__``.
    """
    os.makedirs(output_dir, exist_ok=True)

    ydl_opts = {
        'format': 'bestaudio/best',
        # A watch URL that also carries a playlist id should yield one file,
        # not the whole playlist.
        'noplaylist': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '32',
        }],
        'outtmpl': os.path.join(output_dir, '%(title)s.%(ext)s'),
        'postprocessor_args': [
            '-acodec', 'pcm_f32le'
        ],
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # One call both resolves the metadata and downloads the media.
            info = ydl.extract_info(url, download=True)
            # Ask yt-dlp for the name it actually wrote: it sanitises titles
            # (e.g. "/" or ":"), so rebuilding the path from the raw title
            # can point at a file that does not exist.
            downloaded_name = ydl.prepare_filename(info)

        # The audio-extraction postprocessor replaces the container extension.
        file_path = os.path.splitext(downloaded_name)[0] + ".wav"
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"expected output file not found: {file_path}")
        return os.path.abspath(file_path)

    except Exception as e:
        raise Exception(f"Error extracting audio with yt-dlp: {e}") from e
 
178
 
179
@spaces.GPU(duration=60)
def roformer_separator(audio, model_key, out_format, segment_size, override_seg_size, overlap, batch_size, norm_thresh, amp_thresh, progress=gr.Progress(track_tqdm=True)):
    """Split ``audio`` into two stems with the Roformer checkpoint named by ``model_key``.

    Returns the paths of the second and the first file reported by the
    separator, in that order, each joined onto ``out_dir``. Any failure while
    building the separator, loading the model or separating is re-raised as
    ``RuntimeError``.
    """
    track_name, _extension = os.path.splitext(os.path.basename(audio))
    checkpoint = roformer_models[model_key]

    mdxc_settings = {
        "segment_size": segment_size,
        "override_model_segment_size": override_seg_size,
        "batch_size": batch_size,
        "overlap": overlap,
    }

    try:
        engine = Separator(
            log_level=logging.WARNING,
            model_file_dir=models_dir,
            output_dir=out_dir,
            output_format=out_format,
            use_autocast=use_autocast,
            normalization_threshold=norm_thresh,
            amplification_threshold=amp_thresh,
            mdxc_params=mdxc_settings,
        )

        progress(0.2, desc="Loading model...")
        engine.load_model(model_filename=checkpoint)

        progress(0.7, desc="Separating audio...")
        produced = engine.separate(audio, f"{track_name}_(Stem1)", f"{track_name}_(Stem2)")

        stem_paths = [os.path.join(out_dir, produced_name) for produced_name in produced]
        return stem_paths[1], stem_paths[0]
    except Exception as e:
        raise RuntimeError(f"Roformer separation failed: {e}") from e
210
 
211
@spaces.GPU(duration=60)
def mdxc_separator(audio, model, out_format, segment_size, override_seg_size, overlap, batch_size, norm_thresh, amp_thresh, progress=gr.Progress(track_tqdm=True)):
    """Split ``audio`` into two stems with the MDX23C model file ``model``.

    Returns the paths of the second and the first file reported by the
    separator, in that order, each joined onto ``out_dir``. Any failure while
    building the separator, loading the model or separating is re-raised as
    ``RuntimeError``.
    """
    track_name, _extension = os.path.splitext(os.path.basename(audio))

    mdxc_settings = {
        "segment_size": segment_size,
        "override_model_segment_size": override_seg_size,
        "batch_size": batch_size,
        "overlap": overlap,
    }

    try:
        engine = Separator(
            log_level=logging.WARNING,
            model_file_dir=models_dir,
            output_dir=out_dir,
            output_format=out_format,
            use_autocast=use_autocast,
            normalization_threshold=norm_thresh,
            amplification_threshold=amp_thresh,
            mdxc_params=mdxc_settings,
        )

        progress(0.2, desc="Loading model...")
        engine.load_model(model_filename=model)

        progress(0.7, desc="Separating audio...")
        produced = engine.separate(audio, f"{track_name}_(Stem1)", f"{track_name}_(Stem2)")

        stem_paths = [os.path.join(out_dir, produced_name) for produced_name in produced]
        return stem_paths[1], stem_paths[0]
    except Exception as e:
        raise RuntimeError(f"MDX23C separation failed: {e}") from e
241
 
242
@spaces.GPU(duration=60)
def mdxnet_separator(audio, model, out_format, hop_length, segment_size, denoise, overlap, batch_size, norm_thresh, amp_thresh, progress=gr.Progress(track_tqdm=True)):
    """Split one audio file into two stems with an MDX-NET model.

    Args:
        audio: Path of the input audio file.
        model: Model filename handed to ``Separator.load_model``.
        out_format: Output format forwarded to ``Separator``.
        hop_length, segment_size, overlap, batch_size: Forwarded to
            ``Separator`` through ``mdx_params``.
        denoise: Forwarded as ``enable_denoise`` in ``mdx_params``.
        norm_thresh, amp_thresh: Normalization / amplification thresholds
            forwarded to ``Separator``.
        progress: Gradio progress reporter.

    Returns:
        A ``(first, second)`` tuple of output paths inside ``out_dir``, in the
        order ``separate`` returned them.

    Raises:
        RuntimeError: If building the separator, loading the model or the
            separation itself fails.
    """
    input_name = os.path.basename(audio)
    stem_prefix = os.path.splitext(input_name)[0]

    try:
        mdx_settings = {
            "hop_length": hop_length,
            "segment_size": segment_size,
            "overlap": overlap,
            "batch_size": batch_size,
            "enable_denoise": denoise,
        }
        engine = Separator(
            log_level=logging.WARNING,
            model_file_dir=models_dir,
            output_dir=out_dir,
            output_format=out_format,
            use_autocast=use_autocast,
            normalization_threshold=norm_thresh,
            amplification_threshold=amp_thresh,
            mdx_params=mdx_settings,
        )

        progress(0.2, desc="Loading model...")
        engine.load_model(model_filename=model)

        progress(0.7, desc="Separating audio...")
        produced = engine.separate(audio, f"{stem_prefix}_(Stem1)", f"{stem_prefix}_(Stem2)")

        # separate() yields bare file names; anchor them to the output folder.
        stem_paths = [os.path.join(out_dir, produced_name) for produced_name in produced]
        return stem_paths[0], stem_paths[1]
    except Exception as err:
        raise RuntimeError(f"MDX-NET separation failed: {err}") from err
273
 
274
+ @spaces.GPU(duration=60)
275
+ def vrarch_separator(audio, model, out_format, window_size, aggression, tta, post_process, post_process_threshold, high_end_process, batch_size, norm_thresh, amp_thresh, progress=gr.Progress(track_tqdm=True)):
276
  base_name = os.path.splitext(os.path.basename(audio))[0]
 
277
  try:
 
278
  separator = Separator(
279
  log_level=logging.WARNING,
280
+ model_file_dir=models_dir,
281
  output_dir=out_dir,
282
  output_format=out_format,
283
+ use_autocast=use_autocast,
284
  normalization_threshold=norm_thresh,
285
  amplification_threshold=amp_thresh,
 
286
  vr_params={
287
  "batch_size": batch_size,
288
  "window_size": window_size,
 
294
  }
295
  )
296
 
297
+ progress(0.2, desc="Loading model...")
298
  separator.load_model(model_filename=model)
299
 
300
+ progress(0.7, desc="Separating audio...")
301
  separation = separator.separate(audio, f"{base_name}_(Stem1)", f"{base_name}_(Stem2)")
 
302
 
303
  stems = [os.path.join(out_dir, file_name) for file_name in separation]
304
  return stems[0], stems[1]
305
  except Exception as e:
306
  raise RuntimeError(f"VR ARCH separation failed: {e}") from e
307
 
308
+ @spaces.GPU(duration=60)
309
+ def demucs_separator(audio, model, out_format, shifts, segment_size, segments_enabled, overlap, batch_size, norm_thresh, amp_thresh, progress=gr.Progress(track_tqdm=True)):
310
+ base_name = os.path.splitext(os.path.basename(audio))[0]
311
  try:
 
312
  separator = Separator(
313
  log_level=logging.WARNING,
314
+ model_file_dir=models_dir,
315
  output_dir=out_dir,
316
  output_format=out_format,
317
+ use_autocast=use_autocast,
318
  normalization_threshold=norm_thresh,
319
  amplification_threshold=amp_thresh,
 
320
  demucs_params={
321
+ "batch_size": batch_size,
322
+ "segment_size": segment_size,
323
  "shifts": shifts,
324
  "overlap": overlap,
325
  "segments_enabled": segments_enabled,
326
  }
327
  )
328
 
329
+ progress(0.2, desc="Loading model...")
330
  separator.load_model(model_filename=model)
331
 
332
+ progress(0.7, desc="Separating audio...")
333
  separation = separator.separate(audio)
 
334
 
335
  stems = [os.path.join(out_dir, file_name) for file_name in separation]
336
 
 
347
  else:
348
  return gr.update(visible=False)
349
 
350
@spaces.GPU(duration=60)
def roformer_batch(path_input, path_output, model_key, out_format, segment_size, override_seg_size, overlap, batch_size, norm_thresh, amp_thresh):
    """Separate every supported audio file of a folder with a Roformer model.

    This is a generator: after each step it yields the complete log collected
    so far as one newline-joined string.

    Args:
        path_input: Directory that is scanned (non-recursively) for files
            whose names end with one of ``extensions``.
        path_output: Directory passed to ``Separator`` as ``output_dir``.
        model_key: Key into ``roformer_models``; its value is the checkpoint
            filename handed to ``Separator.load_model``.
        out_format: Output format forwarded to ``Separator``.
        segment_size, override_seg_size, overlap, batch_size: Forwarded to
            ``Separator`` through ``mdxc_params``.
        norm_thresh, amp_thresh: Normalization / amplification thresholds
            forwarded to ``Separator``.

    Yields:
        str: The accumulated log text.

    Raises:
        RuntimeError: If loading the model or separating any file fails; the
            batch stops at the first failure.
    """
    found_files.clear()
    logs.clear()
    roformer_model = roformer_models[model_key]

    # Iterate over a local, sorted snapshot; the module-level list is still
    # filled so its contents match what the original code left behind.
    audio_names = sorted(name for name in os.listdir(path_input) if name.endswith(extensions))
    found_files.extend(audio_names)

    if not audio_names:
        logs.append("No valid audio files.")
        yield "\n".join(logs)
        return

    logs.append(f"{len(audio_names)} audio files found")

    try:
        # Build the separator and load the checkpoint once for the whole
        # batch instead of once per file; neither depends on the file.
        separator = Separator(
            log_level=logging.WARNING,
            model_file_dir=models_dir,
            output_dir=path_output,
            output_format=out_format,
            use_autocast=use_autocast,
            normalization_threshold=norm_thresh,
            amplification_threshold=amp_thresh,
            mdxc_params={
                "segment_size": segment_size,
                "override_model_segment_size": override_seg_size,
                "batch_size": batch_size,
                "overlap": overlap,
            },
        )

        logs.append("Loading model...")
        yield "\n".join(logs)
        separator.load_model(model_filename=roformer_model)

        for audio_name in audio_names:
            file_path = os.path.join(path_input, audio_name)
            base_name = os.path.splitext(audio_name)[0]

            logs.append(f"Separating file: {audio_name}")
            yield "\n".join(logs)
            separator.separate(file_path, f"{base_name}_(Stem1)", f"{base_name}_(Stem2)")
            logs.append(f"File: {audio_name} separated!")
            yield "\n".join(logs)
    except Exception as e:
        raise RuntimeError(f"Roformer batch separation failed: {e}") from e
399
 
400
@spaces.GPU(duration=60)
def mdx23c_batch(path_input, path_output, model, out_format, segment_size, override_seg_size, overlap, batch_size, norm_thresh, amp_thresh):
    """Separate every supported audio file of a folder with an MDX23C model.

    This is a generator: after each step it yields the complete log collected
    so far as one newline-joined string.

    Args:
        path_input: Directory that is scanned (non-recursively) for files
            whose names end with one of ``extensions``.
        path_output: Directory passed to ``Separator`` as ``output_dir``.
        model: Model filename handed to ``Separator.load_model``.
        out_format: Output format forwarded to ``Separator``.
        segment_size, override_seg_size, overlap, batch_size: Forwarded to
            ``Separator`` through ``mdxc_params``.
        norm_thresh, amp_thresh: Normalization / amplification thresholds
            forwarded to ``Separator``.

    Yields:
        str: The accumulated log text.

    Raises:
        RuntimeError: If loading the model or separating any file fails; the
            batch stops at the first failure.
    """
    found_files.clear()
    logs.clear()

    # Iterate over a local, sorted snapshot; the module-level list is still
    # filled so its contents match what the original code left behind.
    audio_names = sorted(name for name in os.listdir(path_input) if name.endswith(extensions))
    found_files.extend(audio_names)

    if not audio_names:
        logs.append("No valid audio files.")
        yield "\n".join(logs)
        return

    logs.append(f"{len(audio_names)} audio files found")

    try:
        # Build the separator and load the model once for the whole batch
        # instead of once per file; neither depends on the file.
        separator = Separator(
            log_level=logging.WARNING,
            model_file_dir=models_dir,
            output_dir=path_output,
            output_format=out_format,
            use_autocast=use_autocast,
            normalization_threshold=norm_thresh,
            amplification_threshold=amp_thresh,
            mdxc_params={
                "segment_size": segment_size,
                "override_model_segment_size": override_seg_size,
                "batch_size": batch_size,
                "overlap": overlap,
            },
        )

        logs.append("Loading model...")
        yield "\n".join(logs)
        separator.load_model(model_filename=model)

        for audio_name in audio_names:
            file_path = os.path.join(path_input, audio_name)
            base_name = os.path.splitext(audio_name)[0]

            logs.append(f"Separating file: {audio_name}")
            yield "\n".join(logs)
            separator.separate(file_path, f"{base_name}_(Stem1)", f"{base_name}_(Stem2)")
            logs.append(f"File: {audio_name} separated!")
            yield "\n".join(logs)
    except Exception as e:
        # The message previously blamed "Roformer" (copy-paste slip).
        raise RuntimeError(f"MDX23C batch separation failed: {e}") from e
448
+
449
@spaces.GPU(duration=60)
def mdxnet_batch(path_input, path_output, model, out_format, hop_length, segment_size, denoise, overlap, batch_size, norm_thresh, amp_thresh):
    """Separate every supported audio file of a folder with an MDX-NET model.

    This is a generator: after each step it yields the complete log collected
    so far as one newline-joined string.

    Args:
        path_input: Directory that is scanned (non-recursively) for files
            whose names end with one of ``extensions``.
        path_output: Directory passed to ``Separator`` as ``output_dir``.
        model: Model filename handed to ``Separator.load_model``.
        out_format: Output format forwarded to ``Separator``.
        hop_length, segment_size, overlap, batch_size: Forwarded to
            ``Separator`` through ``mdx_params``.
        denoise: Forwarded as ``enable_denoise`` in ``mdx_params``.
        norm_thresh, amp_thresh: Normalization / amplification thresholds
            forwarded to ``Separator``.

    Yields:
        str: The accumulated log text.

    Raises:
        RuntimeError: If loading the model or separating any file fails; the
            batch stops at the first failure.
    """
    found_files.clear()
    logs.clear()

    # Iterate over a local, sorted snapshot; the module-level list is still
    # filled so its contents match what the original code left behind.
    audio_names = sorted(name for name in os.listdir(path_input) if name.endswith(extensions))
    found_files.extend(audio_names)

    if not audio_names:
        logs.append("No valid audio files.")
        yield "\n".join(logs)
        return

    logs.append(f"{len(audio_names)} audio files found")

    try:
        # Build the separator and load the model once for the whole batch
        # instead of once per file; neither depends on the file.
        separator = Separator(
            log_level=logging.WARNING,
            model_file_dir=models_dir,
            output_dir=path_output,
            output_format=out_format,
            use_autocast=use_autocast,
            normalization_threshold=norm_thresh,
            amplification_threshold=amp_thresh,
            mdx_params={
                "hop_length": hop_length,
                "segment_size": segment_size,
                "overlap": overlap,
                "batch_size": batch_size,
                "enable_denoise": denoise,
            },
        )

        logs.append("Loading model...")
        yield "\n".join(logs)
        separator.load_model(model_filename=model)

        for audio_name in audio_names:
            file_path = os.path.join(path_input, audio_name)
            base_name = os.path.splitext(audio_name)[0]

            logs.append(f"Separating file: {audio_name}")
            yield "\n".join(logs)
            separator.separate(file_path, f"{base_name}_(Stem1)", f"{base_name}_(Stem2)")
            logs.append(f"File: {audio_name} separated!")
            yield "\n".join(logs)
    except Exception as e:
        # The message previously blamed "Roformer" (copy-paste slip).
        raise RuntimeError(f"MDX-NET batch separation failed: {e}") from e
498
+
499
@spaces.GPU(duration=60)
def vrarch_batch(path_input, path_output, model, out_format, window_size, aggression, tta, post_process, post_process_threshold, high_end_process, batch_size, norm_thresh, amp_thresh):
    """Separate every supported audio file of a folder with a VR ARCH model.

    This is a generator: after each step it yields the complete log collected
    so far as one newline-joined string.

    Args:
        path_input: Directory that is scanned (non-recursively) for files
            whose names end with one of ``extensions``.
        path_output: Directory passed to ``Separator`` as ``output_dir``.
        model: Model filename handed to ``Separator.load_model``.
        out_format: Output format forwarded to ``Separator``.
        window_size, aggression, post_process_threshold, high_end_process,
        batch_size: Forwarded to ``Separator`` through ``vr_params``.
        tta: Forwarded as ``enable_tta`` in ``vr_params``.
        post_process: Forwarded as ``enable_post_process`` in ``vr_params``.
        norm_thresh, amp_thresh: Normalization / amplification thresholds
            forwarded to ``Separator``.

    Yields:
        str: The accumulated log text.

    Raises:
        RuntimeError: If loading the model or separating any file fails; the
            batch stops at the first failure.
    """
    found_files.clear()
    logs.clear()

    # Iterate over a local, sorted snapshot; the module-level list is still
    # filled so its contents match what the original code left behind.
    audio_names = sorted(name for name in os.listdir(path_input) if name.endswith(extensions))
    found_files.extend(audio_names)

    if not audio_names:
        logs.append("No valid audio files.")
        yield "\n".join(logs)
        return

    logs.append(f"{len(audio_names)} audio files found")

    try:
        # Build the separator and load the model once for the whole batch
        # instead of once per file; neither depends on the file.
        separator = Separator(
            log_level=logging.WARNING,
            model_file_dir=models_dir,
            output_dir=path_output,
            output_format=out_format,
            use_autocast=use_autocast,
            normalization_threshold=norm_thresh,
            amplification_threshold=amp_thresh,
            vr_params={
                "batch_size": batch_size,
                "window_size": window_size,
                "aggression": aggression,
                "enable_tta": tta,
                "enable_post_process": post_process,
                "post_process_threshold": post_process_threshold,
                "high_end_process": high_end_process,
            },
        )

        logs.append("Loading model...")
        yield "\n".join(logs)
        separator.load_model(model_filename=model)

        for audio_name in audio_names:
            file_path = os.path.join(path_input, audio_name)
            base_name = os.path.splitext(audio_name)[0]

            logs.append(f"Separating file: {audio_name}")
            yield "\n".join(logs)
            separator.separate(file_path, f"{base_name}_(Stem1)", f"{base_name}_(Stem2)")
            logs.append(f"File: {audio_name} separated!")
            yield "\n".join(logs)
    except Exception as e:
        # The message previously blamed "Roformer" (copy-paste slip).
        raise RuntimeError(f"VR ARCH batch separation failed: {e}") from e
550
+
551
@spaces.GPU(duration=60)
def demucs_batch(path_input, path_output, model, out_format, shifts, segment_size, segments_enabled, overlap, batch_size, norm_thresh, amp_thresh):
    """Separate every supported audio file of a folder with a Demucs model.

    This is a generator: after each step it yields the complete log collected
    so far as one newline-joined string. Unlike the other batch functions,
    no custom stem names are passed to ``separate``.

    Args:
        path_input: Directory that is scanned (non-recursively) for files
            whose names end with one of ``extensions``.
        path_output: Directory passed to ``Separator`` as ``output_dir``.
        model: Model filename handed to ``Separator.load_model``.
        out_format: Output format forwarded to ``Separator``.
        shifts, segment_size, segments_enabled, overlap, batch_size:
            Forwarded to ``Separator`` through ``demucs_params``.
        norm_thresh, amp_thresh: Normalization / amplification thresholds
            forwarded to ``Separator``.

    Yields:
        str: The accumulated log text.

    Raises:
        RuntimeError: If loading the model or separating any file fails; the
            batch stops at the first failure.
    """
    found_files.clear()
    logs.clear()

    # Iterate over a local, sorted snapshot; the module-level list is still
    # filled so its contents match what the original code left behind.
    audio_names = sorted(name for name in os.listdir(path_input) if name.endswith(extensions))
    found_files.extend(audio_names)

    if not audio_names:
        logs.append("No valid audio files.")
        yield "\n".join(logs)
        return

    logs.append(f"{len(audio_names)} audio files found")

    try:
        # Build the separator and load the model once for the whole batch
        # instead of once per file; neither depends on the file.
        separator = Separator(
            log_level=logging.WARNING,
            model_file_dir=models_dir,
            output_dir=path_output,
            output_format=out_format,
            use_autocast=use_autocast,
            normalization_threshold=norm_thresh,
            amplification_threshold=amp_thresh,
            demucs_params={
                "batch_size": batch_size,
                "segment_size": segment_size,
                "shifts": shifts,
                "overlap": overlap,
                "segments_enabled": segments_enabled,
            },
        )

        logs.append("Loading model...")
        yield "\n".join(logs)
        separator.load_model(model_filename=model)

        for audio_name in audio_names:
            file_path = os.path.join(path_input, audio_name)

            logs.append(f"Separating file: {audio_name}")
            yield "\n".join(logs)
            separator.separate(file_path)
            logs.append(f"File: {audio_name} separated!")
            yield "\n".join(logs)
    except Exception as e:
        # The message previously blamed "Roformer" (copy-paste slip).
        raise RuntimeError(f"Demucs batch separation failed: {e}") from e
599
+
600
+ with gr.Blocks(theme ="hev832/applio", title = "🎵 Audio Separator UI 🎵") as app:
601
+ with gr.Row():
602
+ gr.Markdown("<h1> 🎵 Audio Separator UI 🎵 </h1>")
603
+ with gr.Row():
604
+ with gr.Tabs():
605
+ with gr.TabItem("BS/Mel Roformer"):
606
  with gr.Row():
607
+ roformer_model = gr.Dropdown(
608
+ label = "Select the model",
609
+ choices = list(roformer_models.keys()),
610
+ value = lambda : None,
611
+ interactive = True
612
+ )
613
+ roformer_output_format = gr.Dropdown(
614
+ label = "Select the output format",
615
+ choices = output_format,
616
+ value = lambda : None,
617
+ interactive = True
618
+ )
619
# Advanced Roformer settings, collapsed by default.
# Fix: the original header read `gr.Accordion("Advanced settings"), open = False):`
# — the stray ")" after the label made the whole file fail to parse.
# NOTE(review): the source view lost its indentation; the three rows are
# assumed to live inside the Group — confirm against the intended layout.
with gr.Accordion("Advanced settings", open = False):
    with gr.Group():
        with gr.Row():
            roformer_segment_size = gr.Slider(
                label = "Segment size",
                info = "Larger consumes more resources, but may give better results",
                minimum = 32,
                maximum = 4000,
                step = 32,
                value = 256,
                interactive = True
            )
            roformer_override_segment_size = gr.Checkbox(
                label = "Override segment size",
                info = "Override model default segment size instead of using the model default value",
                value = False,
                interactive = True
            )
        with gr.Row():
            roformer_overlap = gr.Slider(
                label = "Overlap",
                info = "Amount of overlap between prediction windows",
                minimum = 2,
                maximum = 10,
                step = 1,
                value = 8,
                interactive = True
            )
            roformer_batch_size = gr.Slider(
                label = "Batch size",
                info = "Larger consumes more RAM but may process slightly faster",
                minimum = 1,
                maximum = 16,
                step = 1,
                value = 1,
                interactive = True
            )
        with gr.Row():
            roformer_normalization_threshold = gr.Slider(
                label = "Normalization threshold",
                info = "The threshold for audio normalization",
                minimum = 0.1,
                maximum = 1,
                step = 0.1,
                value = 0.1,
                interactive = True
            )
            roformer_amplification_threshold = gr.Slider(
                label = "Amplification threshold",
                info = "The threshold for audio amplification",
                minimum = 0.1,
                maximum = 1,
                step = 0.1,
                value = 0.1,
                interactive = True
            )
675
  with gr.Row():
676
+ roformer_audio = gr.Audio(
677
+ label = "Input audio",
678
+ type = "filepath",
679
+ interactive = True
680
+ )
681
+ with gr.Accordion("Separation by link", open = False):
682
+ with gr.Row():
683
+ roformer_link = gr.Textbox(
684
+ label = _("Link"),
685
+ placeholder = "Paste the link here",
686
+ interactive = True
687
+ )
688
+ with gr.Row():
689
+ gr.Markdown("You can paste the link to the video/audio from many sites, check the complete list [here](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)")
690
+ with gr.Row():
691
+ roformer_download_button = gr.Button(
692
+ "Download!",
693
+ variant = "primary"
694
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
695
 
696
+ roformer_download_button.click(download_audio, [roformer_link], [roformer_audio])
697
+
698
+
699
+
700
+
701
+ with gr.Row():
702
+ roformer_button = gr.Button(_("Separate!"), variant = "primary")
703
+ with gr.Row():
704
+ roformer_stem1 = gr.Audio(
705
+ show_download_button = True,
706
+ interactive = False,
707
+ label = _("Stem 1"),
708
+ type = "filepath"
709
+ )
710
+ roformer_stem2 = gr.Audio(
711
+ show_download_button = True,
712
+ interactive = False,
713
+ label = _("Stem 2"),
714
+ type = "filepath"
715
+ )
716
+
717
+ roformer_button.click(roformer_separator, [roformer_audio, roformer_model, roformer_output_format, roformer_segment_size, roformer_override_segment_size, roformer_overlap, roformer_batch_size, roformer_normalization_threshold, roformer_amplification_threshold], [roformer_stem1, roformer_stem2])
718
+
719
+ with gr.TabItem("MDX23C"):
720
+ with gr.Row():
721
+ mdx23c_model = gr.Dropdown(
722
+ label = _("Select the model"),
723
+ choices = mdx23c_models,
724
+ value = lambda : None,
725
+ interactive = True
726
+ )
727
+ mdx23c_output_format = gr.Dropdown(
728
+ label = _("Select the output format"),
729
+ choices = output_format,
730
+ value = lambda : None,
731
+ interactive = True
732
+ )
733
+ with gr.Accordion(_("Advanced settings"), open = False):
734
+ with gr.Group():
735
+ with gr.Row():
736
+ mdx23c_segment_size = gr.Slider(
737
+ minimum = 32,
738
+ maximum = 4000,
739
+ step = 32,
740
+ label = _("Segment size"),
741
+ info = _("Larger consumes more resources, but may give better results"),
742
+ value = 256,
743
+ interactive = True
744
+ )
745
+ mdx23c_override_segment_size = gr.Checkbox(
746
+ label = _("Override segment size"),
747
+ info = _("Override model default segment size instead of using the model default value"),
748
+ value = False,
749
+ interactive = True
750
+ )
751
+ with gr.Row():
752
+ mdx23c_overlap = gr.Slider(
753
+ minimum = 2,
754
+ maximum = 50,
755
+ step = 1,
756
+ label = _("Overlap"),
757
+ info = _("Amount of overlap between prediction windows"),
758
+ value = 8,
759
+ interactive = True
760
+ )
761
+ mdx23c_batch_size = gr.Slider(
762
+ label = _("Batch size"),
763
+ info = _("Larger consumes more RAM but may process slightly faster"),
764
+ minimum = 1,
765
+ maximum = 16,
766
+ step = 1,
767
+ value = 1,
768
+ interactive = True
769
+ )
770
+ with gr.Row():
771
+ mdx23c_normalization_threshold = gr.Slider(
772
+ label = _("Normalization threshold"),
773
+ info = _("The threshold for audio normalization"),
774
+ minimum = 0.1,
775
+ maximum = 1,
776
+ step = 0.1,
777
+ value = 0.1,
778
+ interactive = True
779
+ )
780
+ mdx23c_amplification_threshold = gr.Slider(
781
+ label = _("Amplification threshold"),
782
+ info = _("The threshold for audio amplification"),
783
+ minimum = 0.1,
784
+ maximum = 1,
785
+ step = 0.1,
786
+ value = 0.1,
787
+ interactive = True
788
+ )
789
+ with gr.Row():
790
+ mdx23c_audio = gr.Audio(
791
+ label = _("Input audio"),
792
+ type = "filepath",
793
+ interactive = True
794
+ )
795
+ with gr.Accordion(_("Separation by link"), open = False):
796
+ with gr.Row():
797
+ mdx23c_link = gr.Textbox(
798
+ label = _("Link"),
799
+ placeholder = _("Paste the link here"),
800
+ interactive = True
801
+ )
802
+ with gr.Row():
803
+ gr.Markdown(_("You can paste the link to the video/audio from many sites, check the complete list [here](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)"))
804
+ with gr.Row():
805
+ mdx23c_download_button = gr.Button(
806
+ _("Download!"),
807
+ variant = "primary"
808
+ )
809
+
810
+ mdx23c_download_button.click(download_audio, [mdx23c_link], [mdx23c_audio])
811
+
812
+
813
+ with gr.Row():
814
+ mdx23c_button = gr.Button(_("Separate!"), variant = "primary")
815
+ with gr.Row():
816
+ mdx23c_stem1 = gr.Audio(
817
+ show_download_button = True,
818
+ interactive = False,
819
+ label = _("Stem 1"),
820
+ type = "filepath"
821
+ )
822
+ mdx23c_stem2 = gr.Audio(
823
+ show_download_button = True,
824
+ interactive = False,
825
+ label = _("Stem 2"),
826
+ type = "filepath"
827
+ )
828
+
829
+ mdx23c_button.click(mdxc_separator, [mdx23c_audio, mdx23c_model, mdx23c_output_format, mdx23c_segment_size, mdx23c_override_segment_size, mdx23c_overlap, mdx23c_batch_size, mdx23c_normalization_threshold, mdx23c_amplification_threshold], [mdx23c_stem1, mdx23c_stem2])
830
+
831
+ with gr.TabItem("MDX-NET"):
832
+ with gr.Row():
833
+ mdxnet_model = gr.Dropdown(
834
+ label = _("Select the model"),
835
+ choices = mdxnet_models,
836
+ value = lambda : None,
837
+ interactive = True
838
+ )
839
+ mdxnet_output_format = gr.Dropdown(
840
+ label = _("Select the output format"),
841
+ choices = output_format,
842
+ value = lambda : None,
843
+ interactive = True
844
+ )
845
+ with gr.Accordion(_("Advanced settings"), open = False):
846
+ with gr.Group():
847
+ with gr.Row():
848
+ mdxnet_hop_length = gr.Slider(
849
+ label = _("Hop length"),
850
+ info = _("Usually called stride in neural networks; only change if you know what you're doing"),
851
+ minimum = 32,
852
+ maximum = 2048,
853
+ step = 32,
854
+ value = 1024,
855
+ interactive = True
856
+ )
857
+ mdxnet_segment_size = gr.Slider(
858
+ minimum = 32,
859
+ maximum = 4000,
860
+ step = 32,
861
+ label = _("Segment size"),
862
+ info = _("Larger consumes more resources, but may give better results"),
863
+ value = 256,
864
+ interactive = True
865
+ )
866
+ mdxnet_denoise = gr.Checkbox(
867
+ label = _("Denoise"),
868
+ info = _("Enable denoising during separation"),
869
+ value = True,
870
+ interactive = True
871
+ )
872
+ with gr.Row():
873
+ mdxnet_overlap = gr.Slider(
874
+ label = _("Overlap"),
875
+ info = _("Amount of overlap between prediction windows"),
876
+ minimum = 0.001,
877
+ maximum = 0.999,
878
+ step = 0.001,
879
+ value = 0.25,
880
+ interactive = True
881
+ )
882
+ mdxnet_batch_size = gr.Slider(
883
+ label = _("Batch size"),
884
+ info = _("Larger consumes more RAM but may process slightly faster"),
885
+ minimum = 1,
886
+ maximum = 16,
887
+ step = 1,
888
+ value = 1,
889
+ interactive = True
890
+ )
891
+ with gr.Row():
892
+ mdxnet_normalization_threshold = gr.Slider(
893
+ label = _("Normalization threshold"),
894
+ info = _("The threshold for audio normalization"),
895
+ minimum = 0.1,
896
+ maximum = 1,
897
+ step = 0.1,
898
+ value = 0.1,
899
+ interactive = True
900
+ )
901
+ mdxnet_amplification_threshold = gr.Slider(
902
+ label = _("Amplification threshold"),
903
+ info = _("The threshold for audio amplification"),
904
+ minimum = 0.1,
905
+ maximum = 1,
906
+ step = 0.1,
907
+ value = 0.1,
908
+ interactive = True
909
+ )
910
+ with gr.Row():
911
+ mdxnet_audio = gr.Audio(
912
+ label = _("Input audio"),
913
+ type = "filepath",
914
+ interactive = True
915
+ )
916
+ with gr.Accordion(_("Separation by link"), open = False):
917
+ with gr.Row():
918
+ mdxnet_link = gr.Textbox(
919
+ label = _("Link"),
920
+ placeholder = _("Paste the link here"),
921
+ interactive = True
922
+ )
923
+ with gr.Row():
924
+ gr.Markdown("You can paste the link to the video/audio from many sites, check the complete list [here](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)")
925
+ with gr.Row():
926
+ mdxnet_download_button = gr.Button(
927
+ "Download!",
928
+ variant = "primary"
929
+ )
930
+
931
+ mdxnet_download_button.click(download_audio, [mdxnet_link], [mdxnet_audio])
932
+
933
+
934
+ with gr.Row():
935
+ mdxnet_button = gr.Button("Separate!", variant = "primary")
936
+ with gr.Row():
937
+ mdxnet_stem1 = gr.Audio(
938
+ show_download_button = True,
939
+ interactive = False,
940
+ label = "Stem 1",
941
+ type = "filepath"
942
+ )
943
+ mdxnet_stem2 = gr.Audio(
944
+ show_download_button = True,
945
+ interactive = False,
946
+ label = "Stem 2",
947
+ type = "filepath"
948
+ )
949
+
950
+ mdxnet_button.click(mdxnet_separator, [mdxnet_audio, mdxnet_model, mdxnet_output_format, mdxnet_hop_length, mdxnet_segment_size, mdxnet_denoise, mdxnet_overlap, mdxnet_batch_size, mdxnet_normalization_threshold, mdxnet_amplification_threshold], [mdxnet_stem1, mdxnet_stem2])
951
+
952
+ with gr.TabItem("VR ARCH"):
953
+ with gr.Row():
954
+ vrarch_model = gr.Dropdown(
955
+ label = "Select the model",
956
+ choices = vrarch_models,
957
+ value = lambda : None,
958
+ interactive = True
959
+ )
960
+ vrarch_output_format = gr.Dropdown(
961
+ label = "Select the output format",
962
+ choices = output_format,
963
+ value = lambda : None,
964
+ interactive = True
965
+ )
966
+ with gr.Accordion("Advanced settings", open = False):
967
+ with gr.Group():
968
+ with gr.Row():
969
+ vrarch_window_size = gr.Slider(
970
+ label = _("Window size"),
971
+ info = _("Balance quality and speed. 1024 = fast but lower, 320 = slower but better quality"),
972
+ minimum=320,
973
+ maximum=1024,
974
+ step=32,
975
+ value = 512,
976
+ interactive = True
977
+ )
978
+ vrarch_agression = gr.Slider(
979
+ minimum = 1,
980
+ maximum = 50,
981
+ step = 1,
982
+ label = "Agression",
983
+ info = "Intensity of primary stem extraction",
984
+ value = 5,
985
+ interactive = True
986
+ )
987
+ vrarch_tta = gr.Checkbox(
988
+ label = "TTA",
989
+ info = "Enable Test-Time-Augmentation; slow but improves quality",
990
+ value = True,
991
+ visible = True,
992
+ interactive = True
993
+ )
994
+ with gr.Row():
995
+ vrarch_post_process = gr.Checkbox(
996
+ label = "Post process",
997
+ info = "Identify leftover artifacts within vocal output; may improve separation for some songs",
998
+ value = False,
999
+ visible = True,
1000
+ interactive = True
1001
+ )
1002
+ vrarch_post_process_threshold = gr.Slider(
1003
+ label = "Post process threshold",
1004
+ info = "Threshold for post-processing",
1005
+ minimum = 0.1,
1006
+ maximum = 0.3,
1007
+ step = 0.1,
1008
+ value = 0.2,
1009
+ interactive = True
1010
+ )
1011
+ with gr.Row():
1012
+ vrarch_high_end_process = gr.Checkbox(
1013
+ label = "High end process",
1014
+ info = "Mirror the missing frequency range of the output",
1015
+ value = False,
1016
+ visible = True,
1017
+ interactive = True,
1018
+ )
1019
+ vrarch_batch_size = gr.Slider(
1020
+ label = "Batch size",
1021
+ info = "Larger consumes more RAM but may process slightly faster",
1022
+ minimum = 1,
1023
+ maximum = 16,
1024
+ step = 1,
1025
+ value = 1,
1026
+ interactive = True
1027
+ )
1028
+ with gr.Row():
1029
+ vrarch_normalization_threshold = gr.Slider(
1030
+ label = "Normalization threshold",
1031
+ info = "The threshold for audio normalization",
1032
+ minimum = 0.1,
1033
+ maximum = 1,
1034
+ step = 0.1,
1035
+ value = 0.1,
1036
+ interactive = True
1037
+ )
1038
+ vrarch_amplification_threshold = gr.Slider(
1039
+ label = "Amplification threshold",
1040
+ info = "The threshold for audio amplification",
1041
+ minimum = 0.1,
1042
+ maximum = 1,
1043
+ step = 0.1,
1044
+ value = 0.1,
1045
+ interactive = True
1046
+ )
1047
+ with gr.Row():
1048
+ vrarch_audio = gr.Audio(
1049
+ label = "Input audio",
1050
+ type = "filepath",
1051
+ interactive = True
1052
+ )
1053
# "Separation by link" panel of the VR ARCH tab, collapsed by default.
# Fix: the original header read `gr.Accordion("Separation by link"), open = False):`
# — the stray ")" after the label was a syntax error.
# NOTE(review): the source view lost its indentation; the three rows are
# assumed to be children of the accordion — confirm against the intended layout.
with gr.Accordion("Separation by link", open = False):
    with gr.Row():
        vrarch_link = gr.Textbox(
            label = "Link",
            placeholder = _("Paste the link here"),
            interactive = True
        )
    with gr.Row():
        gr.Markdown("You can paste the link to the video/audio from many sites, check the complete list [here](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)")
    with gr.Row():
        vrarch_download_button = gr.Button(
            "Download!",
            variant = "primary"
        )
1067
+
1068
+ vrarch_download_button.click(download_audio, [vrarch_link], [vrarch_audio])
1069
 
1070
+
1071
+ with gr.Row():
1072
+ vrarch_button = gr.Button("Separate!", variant = "primary")
1073
+ with gr.Row():
1074
+ vrarch_stem1 = gr.Audio(
1075
+ show_download_button = True,
1076
+ interactive = False,
1077
+ type = "filepath",
1078
+ label = "Stem 1"
1079
+ )
1080
+ vrarch_stem2 = gr.Audio(
1081
+ show_download_button = True,
1082
+ interactive = False,
1083
+ type = "filepath",
1084
+ label = "Stem 2"
1085
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1086
 
1087
+ vrarch_button.click(vrarch_separator, [vrarch_audio, vrarch_model, vrarch_output_format, vrarch_window_size, vrarch_agression, vrarch_tta, vrarch_post_process, vrarch_post_process_threshold, vrarch_high_end_process, vrarch_batch_size, vrarch_normalization_threshold, vrarch_amplification_threshold], [vrarch_stem1, vrarch_stem2])
1088
+
1089
+ with gr.TabItem("Demucs"):
1090
+ with gr.Row():
1091
+ demucs_model = gr.Dropdown(
1092
+ label = "Select the model",
1093
+ choices = demucs_models,
1094
+ value = lambda : None,
1095
+ interactive = True
1096
+ )
1097
+ demucs_output_format = gr.Dropdown(
1098
+ label = "Select the output format",
1099
+ choices = output_format,
1100
+ value = lambda : None,
1101
+ interactive = True
1102
+ )
1103
+ with gr.Accordion("Advanced settings", open = False):
1104
+ with gr.Group():
1105
+ with gr.Row():
1106
+ demucs_shifts = gr.Slider(
1107
+ label = "Shifts",
1108
+ info = "Number of predictions with random shifts, higher = slower but better quality",
1109
+ minimum = 1,
1110
+ maximum = 20,
1111
+ step = 1,
1112
+ value = 2,
1113
+ interactive = True
1114
+ )
1115
+ demucs_segment_size = gr.Slider(
1116
+ label = "Segment size",
1117
+ info = "Size of segments into which the audio is split. Higher = slower but better quality",
1118
+ minimum = 1,
1119
+ maximum = 100,
1120
+ step = 1,
1121
+ value = 40,
1122
+ interactive = True
1123
+ )
1124
+ demucs_segments_enabled = gr.Checkbox(
1125
+ label = "Segment-wise processing",
1126
+ info = "Enable segment-wise processing",
1127
+ value = True,
1128
+ interactive = True
1129
+ )
1130
+ with gr.Row():
1131
+ demucs_overlap = gr.Slider(
1132
+ label = "Overlap",
1133
+ info = "Overlap between prediction windows. Higher = slower but better quality",
1134
+ minimum=0.001,
1135
+ maximum=0.999,
1136
+ step=0.001,
1137
+ value = 0.25,
1138
+ interactive = True
1139
+ )
1140
+ demucs_batch_size = gr.Slider(
1141
+ label = "Batch size",
1142
+ info = "Larger consumes more RAM but may process slightly faster",
1143
+ minimum = 1,
1144
+ maximum = 16,
1145
+ step = 1,
1146
+ value = 1,
1147
+ interactive = True
1148
+ )
1149
+ with gr.Row():
1150
+ demucs_normalization_threshold = gr.Slider(
1151
+ label = "Normalization threshold",
1152
+ info = "The threshold for audio normalization",
1153
+ minimum = 0.1,
1154
+ maximum = 1,
1155
+ step = 0.1,
1156
+ value = 0.1,
1157
+ interactive = True
1158
+ )
1159
+ demucs_amplification_threshold = gr.Slider(
1160
+ label = "Amplification threshold",
1161
+ info = "The threshold for audio amplification",
1162
+ minimum = 0.1,
1163
+ maximum = 1,
1164
+ step = 0.1,
1165
+ value = 0.1,
1166
+ interactive = True
1167
+ )
1168
+ with gr.Row():
1169
+ demucs_audio = gr.Audio(
1170
+ label = "Input audio",
1171
+ type = "filepath",
1172
+ interactive = True
1173
+ )
1174
+ with gr.Accordion("Separation by link", open = False):
1175
+ with gr.Row():
1176
+ demucs_link = gr.Textbox(
1177
+ label = "Link",
1178
+ placeholder = "Paste the link here",
1179
+ interactive = True
1180
+ )
1181
+ with gr.Row():
1182
+ gr.Markdown("You can paste the link to the video/audio from many sites, check the complete list [here](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)")
1183
+ with gr.Row():
1184
+ demucs_download_button = gr.Button(
1185
+ "Download!",
1186
+ variant = "primary"
1187
+ )
1188
+
1189
+ demucs_download_button.click(download_audio, [demucs_link], [demucs_audio])
1190
+
1191
+
1192
+ with gr.Row():
1193
+ demucs_bath_button = gr.Button("Separate!", variant = "primary")
1194
+ with gr.Row():
1195
+ demucs_info = gr.Textbox(
1196
+ label = "Output information",
1197
+ interactive = False
1198
+ )
1199
+
1200
+
1201
with gr.Row():
    # Fix: the original read `gr.Button("Separate!"), variant = "primary")`;
    # the stray ")" after the label was a syntax error.
    demucs_button = gr.Button("Separate!", variant = "primary")
1203
# All six stem players share the same configuration (downloadable,
# non-interactive, file-path based); only the label differs.
_demucs_stem_kwargs = {
    "show_download_button": True,
    "interactive": False,
    "type": "filepath",
}

with gr.Row():
    demucs_stem1 = gr.Audio(label="Stem 1", **_demucs_stem_kwargs)
    demucs_stem2 = gr.Audio(label="Stem 2", **_demucs_stem_kwargs)
with gr.Row():
    demucs_stem3 = gr.Audio(label="Stem 3", **_demucs_stem_kwargs)
    demucs_stem4 = gr.Audio(label="Stem 4", **_demucs_stem_kwargs)
# Starts hidden; this row is the output of demucs_model.change(update_stems, ...).
with gr.Row(visible=False) as stem6:
    demucs_stem5 = gr.Audio(label="Stem 5", **_demucs_stem_kwargs)
    demucs_stem6 = gr.Audio(label="Stem 6", **_demucs_stem_kwargs)
1242
+
1243
# update_stems (defined outside this chunk) receives the selected model; its
# return value is applied to the initially hidden `stem6` row.
demucs_model.change(
    fn=update_stems,
    inputs=[demucs_model],
    outputs=stem6,
)

# Run the separation. The input order below is the positional argument order
# demucs_separator is called with, so it must not be rearranged.
demucs_button.click(
    fn=demucs_separator,
    inputs=[
        demucs_audio,
        demucs_model,
        demucs_output_format,
        demucs_shifts,
        demucs_segment_size,
        demucs_segments_enabled,
        demucs_overlap,
        demucs_batch_size,
        demucs_normalization_threshold,
        demucs_amplification_threshold,
    ],
    outputs=[
        demucs_stem1,
        demucs_stem2,
        demucs_stem3,
        demucs_stem4,
        demucs_stem5,
        demucs_stem6,
    ],
)
1246
+
1247
+
1248
# Static credits page.
# Fix: the bold marker opened before "[Eddycrack 864]" was never closed, so the
# "**" did not render as intended; it is now closed after the author names.
# NOTE(review): "[Eddycrack 864]" has no link target in the original text --
# add the URL if a link was intended.
# NOTE(review): the text is indented with the code; this assumes gr.Markdown
# strips common leading indentation -- confirm for the pinned Gradio version.
with gr.TabItem("Credits"):
    gr.Markdown(
        """
        audio separator UI created by **[Eddycrack 864] & [_noxty](https://huggingface.co/theNeofr)**.
        * python-audio-separator by [beveradb](https://github.com/beveradb).
        * Special thanks to [Ilaria](https://github.com/TheStingerX) for hosting this space and help.
        * Thanks to [Mikus](https://github.com/cappuch) for the help with the code.
        * Thanks to [Nick088](https://huggingface.co/Nick088) for the help to fix roformers.
        * Thanks to [yt_dlp](https://github.com/yt-dlp/yt-dlp) devs.
        * Separation by link source code and improvements by [Blane187](https://huggingface.co/Blane187).
        * Thanks to [ArisDev](https://github.com/aris-py) for porting UVR5 UI to Kaggle and improvements.
        * Thanks to [Bebra777228](https://github.com/Bebra777228)'s code for guiding me to improve my code.


        You can donate to the original UVR5 project here:
        [!["Buy Me A Coffee"](https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png)](https://www.buymeacoffee.com/uvr5)
        """
    )
1266
 
1267
# Script entry point: turn on Gradio's request queue, then start the server
# with a share link requested and debug mode enabled.
app.queue()
app.launch(debug=True, share=True)