import os

import gradio as gr
import soundfile as sf
import spaces
import torch
from clearvoice import ClearVoice
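
# Gradio Spaces demo for ClearVoice covering three tasks: speech enhancement,
# speech separation, and audio-visual target speaker extraction. Each handler
# below runs on ZeroGPU hardware via the @spaces.GPU decorator.
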
@spaces.GPU
def fn_clearvoice_se(input_wav, sr):
    # Pick the model matching the requested sampling rate: FRCRN for 16 kHz,
    # MossFormer2 for 48 kHz.
    if sr == "16000":
        myClearVoice = ClearVoice(task='speech_enhancement', model_names=['FRCRN_SE_16K'])
        fs = 16000
    else:
        myClearVoice = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
        fs = 48000
    output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
    # ClearVoice may return either a dict keyed by model name or a bare array.
    if isinstance(output_wav_dict, dict):
        key = next(iter(output_wav_dict))
        output_wav = output_wav_dict[key]
    else:
        output_wav = output_wav_dict
    sf.write('enhanced.wav', output_wav, fs)
    return 'enhanced.wav'
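
# A minimal usage sketch (not executed here); 'noisy_speech.wav' is a
# hypothetical input path, and the second argument mirrors the dropdown
# values defined further below:
#
#     out_path = fn_clearvoice_se('noisy_speech.wav', '48000')
#     # -> 'enhanced.wav', written at 48 kHz
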
@spaces.GPU
def fn_clearvoice_ss(input_wav):
    myClearVoice = ClearVoice(task='speech_separation', model_names=['MossFormer2_SS_16K'])
    output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
    # ClearVoice may return either a dict keyed by model name or a bare list.
    if isinstance(output_wav_dict, dict):
        key = next(iter(output_wav_dict))
        output_wav_list = output_wav_dict[key]
    else:
        output_wav_list = output_wav_dict
    # The 16 kHz separation model produces two speaker tracks.
    output_wav_s1, output_wav_s2 = output_wav_list[0], output_wav_list[1]
    sf.write('separated_s1.wav', output_wav_s1, 16000)
    sf.write('separated_s2.wav', output_wav_s2, 16000)
    return "separated_s1.wav", "separated_s2.wav"
def find_mp4_files(directory):
    mp4_files = []
    # Walk through the directory and its subdirectories, collecting the
    # estimated-speaker clips that ClearVoice writes out (named 'est*.mp4').
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".mp4") and file.startswith("est"):
                mp4_files.append(os.path.join(root, file))
    return mp4_files
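
# Example (illustrative only; actual file names depend on how ClearVoice lays
# out its output directory):
#
#     find_mp4_files('path_to_output_videos_tse/')
#     # -> ['path_to_output_videos_tse/.../est_s1.mp4', ...]
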
@spaces.GPU(duration=300)
def fn_clearvoice_tse(input_video):
    myClearVoice = ClearVoice(task='target_speaker_extraction', model_names=['AV_MossFormer2_TSE_16K'])
    print(f'input_video: {input_video}')
    # The TSE model writes one output video per detected speaker.
    myClearVoice(input_path=input_video, online_write=True, output_path='path_to_output_videos_tse')
    output_list = find_mp4_files('path_to_output_videos_tse/')
    print(output_list)
    return output_list
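
# A minimal usage sketch (not executed here); 'meeting_clip.mp4' is a
# hypothetical input video:
#
#     clips = fn_clearvoice_tse('meeting_clip.mp4')
#     # -> one extracted .mp4 path per detected speaker
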
demo = gr.Blocks()

se_demo = gr.Interface(
    fn=fn_clearvoice_se,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
        gr.Dropdown(
            ["16000", "48000"], value="16000", multiselect=False,
            label="Sampling Rate", info="Choose the sampling rate for your output.",
        ),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    title="ClearVoice: Speech Enhancement",
    description=("Gradio demo for speech enhancement with ClearVoice. The models support 16 kHz (FRCRN backbone) and 48 kHz (MossFormer2 backbone) sampling rates. "
                 "We provide generalized models trained on large-scale data to handle a variety of background environments. "
                 "To test it, simply upload your audio, or click one of the examples to load it. Read more at the links below."),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement</a> | <a href='https://github.com/alibabasglab/FRCRN' target='_blank'>Github Repo</a></p>"),
    examples=[
        ["examples/mandarin_speech_16kHz.wav", "16000"],
        ["examples/english_speech_48kHz.wav", "48000"],
    ],
    cache_examples=True,
)

ss_demo = gr.Interface(
    fn=fn_clearvoice_ss,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
    ],
    outputs=[
        gr.Audio(label="Output Audio (Speaker 1)", type="filepath"),
        gr.Audio(label="Output Audio (Speaker 2)", type="filepath"),
    ],
    title="ClearVoice: Speech Separation",
    description=("Gradio demo for speech separation with ClearVoice. The model (MossFormer2 backbone) supports two-speaker audio mixtures at a 16 kHz sampling rate. "
                 "We provide generalized models trained on large-scale data to handle independent speakers and a variety of background environments. "
                 "To test it, simply upload your audio, or click one of the examples to load it. Read more at the links below."),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
             "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
    examples=[
        ['examples/female_female_speech.wav'],
        ['examples/female_male_speech.wav'],
    ],
    cache_examples=True,
)

tse_demo = gr.Interface(
    fn=fn_clearvoice_tse,
    inputs=[
        gr.Video(label="Input Video"),
    ],
    outputs=[
        gr.Gallery(label="Output Video List"),
    ],
    title="ClearVoice: Audio-Visual Speaker Extraction",
    description=("Gradio demo for audio-visual target speaker extraction with ClearVoice. The model (AV_MossFormer2_TSE_16K) supports a 16 kHz sampling rate. "
                 "We provide generalized models trained on mid-scale data to handle independent speakers and a variety of background environments. "
                 "To test it, simply upload your video, or click one of the examples to load it. Read more at the links below."),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
             "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
    # NOTE: these example paths are audio files carried over from the other
    # demos; the video input above expects .mp4 clips.
    examples=[
        ['examples/female_female_speech.wav'],
        ['examples/female_male_speech.wav'],
    ],
    cache_examples=True,
)

with demo:
    gr.TabbedInterface(
        [se_demo, ss_demo, tse_demo],
        ["Speech Enhancement", "Speech Separation", "Target Speaker Extraction"],
    )

demo.launch()