import os

import gradio as gr
import soundfile as sf
import spaces
import torch
from clearvoice import ClearVoice
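
# Gradio Spaces demo for ClearVoice covering three tasks: speech enhancement,
# speech separation, and audio-visual target speaker extraction. Each handler
# below runs on ZeroGPU hardware via the @spaces.GPU decorator.
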
@spaces.GPU
def fn_clearvoice_se(input_wav, sr):
    # Pick the model matching the requested sampling rate: FRCRN for 16 kHz,
    # MossFormer2 for 48 kHz.
    if sr == "16000":
        myClearVoice = ClearVoice(task='speech_enhancement', model_names=['FRCRN_SE_16K'])
        fs = 16000
    else:
        myClearVoice = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
        fs = 48000
    output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
    # ClearVoice may return either a dict keyed by model name or a bare array.
    if isinstance(output_wav_dict, dict):
        key = next(iter(output_wav_dict))
        output_wav = output_wav_dict[key]
    else:
        output_wav = output_wav_dict
    sf.write('enhanced.wav', output_wav, fs)
    return 'enhanced.wav'
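
# A minimal usage sketch (not executed here); 'noisy_speech.wav' is a
# hypothetical input path, and the second argument mirrors the dropdown
# values defined further below:
#
#     out_path = fn_clearvoice_se('noisy_speech.wav', '48000')
#     # -> 'enhanced.wav', written at 48 kHz
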
@spaces.GPU
def fn_clearvoice_ss(input_wav):
    myClearVoice = ClearVoice(task='speech_separation', model_names=['MossFormer2_SS_16K'])
    output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
    # ClearVoice may return either a dict keyed by model name or a bare list.
    if isinstance(output_wav_dict, dict):
        key = next(iter(output_wav_dict))
        output_wav_list = output_wav_dict[key]
    else:
        output_wav_list = output_wav_dict
    # The 16 kHz separation model produces two speaker tracks.
    output_wav_s1, output_wav_s2 = output_wav_list[0], output_wav_list[1]
    sf.write('separated_s1.wav', output_wav_s1, 16000)
    sf.write('separated_s2.wav', output_wav_s2, 16000)
    return "separated_s1.wav", "separated_s2.wav"
def find_mp4_files(directory):
    mp4_files = []
    # Walk through the directory and its subdirectories, collecting the
    # estimated-speaker clips that ClearVoice writes out (named 'est*.mp4').
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".mp4") and file.startswith("est"):
                mp4_files.append(os.path.join(root, file))
    return mp4_files
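
# Example (illustrative only; actual file names depend on how ClearVoice lays
# out its output directory):
#
#     find_mp4_files('path_to_output_videos_tse/')
#     # -> ['path_to_output_videos_tse/.../est_s1.mp4', ...]
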
@spaces.GPU(duration=300)
def fn_clearvoice_tse(input_video):
    myClearVoice = ClearVoice(task='target_speaker_extraction', model_names=['AV_MossFormer2_TSE_16K'])
    print(f'input_video: {input_video}')
    # The TSE model writes one output video per detected speaker.
    myClearVoice(input_path=input_video, online_write=True, output_path='path_to_output_videos_tse')
    output_list = find_mp4_files('path_to_output_videos_tse/')
    print(output_list)
    return output_list
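
# A minimal usage sketch (not executed here); 'meeting_clip.mp4' is a
# hypothetical input video:
#
#     clips = fn_clearvoice_tse('meeting_clip.mp4')
#     # -> one extracted .mp4 path per detected speaker
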
demo = gr.Blocks()

se_demo = gr.Interface(
    fn=fn_clearvoice_se,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
        gr.Dropdown(
            ["16000", "48000"], value="16000", multiselect=False,
            label="Sampling Rate", info="Choose the sampling rate for your output.",
        ),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    title="ClearVoice: Speech Enhancement",
    description=("Gradio demo for speech enhancement with ClearVoice. The models support 16 kHz (FRCRN backbone) and 48 kHz (MossFormer2 backbone) sampling rates. "
                 "We provide generalized models trained on large-scale data to handle a variety of background environments. "
                 "To test it, simply upload your audio, or click one of the examples to load it. Read more at the links below."),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement</a> | <a href='https://github.com/alibabasglab/FRCRN' target='_blank'>Github Repo</a></p>"),
    examples=[
        ["examples/mandarin_speech_16kHz.wav", "16000"],
        ["examples/english_speech_48kHz.wav", "48000"],
    ],
    cache_examples=True,
)

ss_demo = gr.Interface(
    fn=fn_clearvoice_ss,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
    ],
    outputs=[
        gr.Audio(label="Output Audio (Speaker 1)", type="filepath"),
        gr.Audio(label="Output Audio (Speaker 2)", type="filepath"),
    ],
    title="ClearVoice: Speech Separation",
    description=("Gradio demo for speech separation with ClearVoice. The model (MossFormer2 backbone) supports two-speaker audio mixtures at a 16 kHz sampling rate. "
                 "We provide generalized models trained on large-scale data to handle independent speakers and a variety of background environments. "
                 "To test it, simply upload your audio, or click one of the examples to load it. Read more at the links below."),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
             "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
    examples=[
        ['examples/female_female_speech.wav'],
        ['examples/female_male_speech.wav'],
    ],
    cache_examples=True,
)

tse_demo = gr.Interface(
    fn=fn_clearvoice_tse,
    inputs=[
        gr.Video(label="Input Video"),
    ],
    outputs=[
        gr.Gallery(label="Output Video List"),
    ],
    title="ClearVoice: Audio-Visual Speaker Extraction",
    description=("Gradio demo for audio-visual target speaker extraction with ClearVoice. The model (AV_MossFormer2_TSE_16K) supports a 16 kHz sampling rate. "
                 "We provide generalized models trained on mid-scale data to handle independent speakers and a variety of background environments. "
                 "To test it, simply upload your video, or click one of the examples to load it. Read more at the links below."),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
             "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
    # NOTE: these example paths are audio files carried over from the other
    # demos; the video input above expects .mp4 clips.
    examples=[
        ['examples/female_female_speech.wav'],
        ['examples/female_male_speech.wav'],
    ],
    cache_examples=True,
)

with demo:
    gr.TabbedInterface(
        [se_demo, ss_demo, tse_demo],
        ["Speech Enhancement", "Speech Separation", "Target Speaker Extraction"],
    )

demo.launch()