File size: 6,865 Bytes
02c7bdf
bdaf47a
02c7bdf
7b02833
02c7bdf
a1655f3
02c7bdf
e537531
e63a812
3e28721
e63a812
 
 
 
 
a9e592e
132a2a9
 
 
 
 
e63a812
02c7bdf
3192961
e537531
b78b7d0
 
 
 
 
4554491
 
 
b78b7d0
4554491
 
 
 
 
 
b78b7d0
f8605aa
 
 
 
 
 
 
b02e870
f8605aa
 
 
 
9e426ab
0e2caa6
d431bb0
f8605aa
 
d431bb0
 
f8605aa
27ef3d7
3ee12a7
b02e870
f8605aa
3192961
 
3956066
b78b7d0
 
e805751
e63a812
3e28721
e63a812
b78b7d0
 
e805751
b78b7d0
43e8301
2e5499c
8310825
43e8301
 
b78b7d0
dc8fb4a
 
b78b7d0
 
 
 
 
8bb6908
3956066
e805751
3956066
 
e805751
 
3956066
43e8301
2e5499c
8310825
43e8301
 
3956066
6320c59
 
d04ae35
3956066
 
b78b7d0
f8605aa
7be4073
f8605aa
d431bb0
f8605aa
 
56f7076
f8605aa
43e8301
2e5499c
8310825
9e426ab
 
f8605aa
c7ca2a0
 
f8605aa
 
 
 
99710ec
8310825
49effbd
0f8dddd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import torch
import soundfile as sf
import gradio as gr
import spaces
from clearvoice import ClearVoice
import os

@spaces.GPU
def fn_clearvoice_se(input_wav, sr):
    """Enhance the speech in an audio file and return the path of the result.

    Args:
        input_wav: Path of the input audio file; forwarded to ClearVoice as
            ``input_path``.
        sr: Sampling-rate choice coming from the UI. ``"16000 Hz"`` selects
            the ``FRCRN_SE_16K`` model and a 16000 Hz output; any other value
            selects ``MossFormer2_SE_48K`` and a 48000 Hz output.

    Returns:
        Path of a newly created WAV file containing the enhanced audio.
    """
    import tempfile  # function-scope import keeps this block self-contained

    if sr == "16000 Hz":
        model_name, fs = 'FRCRN_SE_16K', 16000
    else:
        model_name, fs = 'MossFormer2_SE_48K', 48000

    myClearVoice = ClearVoice(task='speech_enhancement', model_names=[model_name])
    result = myClearVoice(input_path=input_wav, online_write=False)

    # The call may hand back the waveform directly or wrapped in a dict;
    # in the dict case the first entry is used.
    output_wav = result[next(iter(result))] if isinstance(result, dict) else result

    # Write to a unique file instead of a fixed 'enhanced.wav' so overlapping
    # requests cannot overwrite each other's output. The file is intentionally
    # not deleted here because the caller still has to read it.
    with tempfile.NamedTemporaryFile(prefix='enhanced_', suffix='.wav', delete=False) as tmp:
        output_path = tmp.name
    sf.write(output_path, output_wav, fs)
    return output_path

@spaces.GPU
def fn_clearvoice_ss(input_wav):
    """Separate a mixed recording into two streams and return both file paths.

    Args:
        input_wav: Path of the input audio file; forwarded to ClearVoice as
            ``input_path``.

    Returns:
        A 2-tuple with the paths of two newly created WAV files (written at
        16000 Hz), one per separated stream, in the order the model returned
        them.

    Raises:
        IndexError: If the model output holds fewer than two streams.
    """
    import tempfile  # function-scope import keeps this block self-contained

    myClearVoice = ClearVoice(task='speech_separation', model_names=['MossFormer2_SS_16K'])
    result = myClearVoice(input_path=input_wav, online_write=False)

    # The call may hand back the list of streams directly or wrapped in a
    # dict; in the dict case the first entry is used.
    output_wav_list = result[next(iter(result))] if isinstance(result, dict) else result
    streams = (output_wav_list[0], output_wav_list[1])

    # Write to unique files instead of fixed 'separated_s1.wav' /
    # 'separated_s2.wav' so overlapping requests cannot overwrite each
    # other's output. The files are intentionally not deleted here because
    # the caller still has to read them.
    output_paths = []
    for index, stream in enumerate(streams, start=1):
        with tempfile.NamedTemporaryFile(
            prefix=f'separated_s{index}_', suffix='.wav', delete=False
        ) as tmp:
            stream_path = tmp.name
        sf.write(stream_path, stream, 16000)
        output_paths.append(stream_path)
    return tuple(output_paths)

def find_mp4_files(directory):
    """Return the paths of all ``est*.mp4`` files found under *directory*.

    The directory is searched recursively. A file matches when its name
    starts with ``est`` and ends with ``.mp4`` (case-sensitive).

    Args:
        directory: Root directory to search.

    Returns:
        A sorted list of matching file paths (each joined with the directory
        it was found in). The list is empty when nothing matches or when
        *directory* does not exist.
    """
    matches = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.startswith('est') and name.endswith('.mp4')
    ]
    # os.walk yields directory entries in arbitrary order; sort so callers
    # get a stable, reproducible ordering.
    matches.sort()
    return matches
    

@spaces.GPU()
def fn_clearvoice_tse(input_video):
    """Run audio-visual target speaker extraction on a video.

    The ``AV_MossFormer2_TSE_16K`` model is invoked with ``online_write=True``
    and ``output_path='path_to_output_videos_tse'``. Afterwards the folder
    named after the input video (the part of its base name before the first
    dot) is searched for result files with ``find_mp4_files``.

    Args:
        input_video: Path of the input video file.

    Returns:
        List of paths of the ``est*.mp4`` files found for this video.
    """
    extractor = ClearVoice(task='target_speaker_extraction', model_names=['AV_MossFormer2_TSE_16K'])
    print(f'input_video: {input_video}')
    extractor(input_path=input_video, online_write=True, output_path='path_to_output_videos_tse')

    # Folder name: base name of the input truncated at its first dot.
    video_stem = os.path.basename(input_video).split(".")[0]
    return find_mp4_files(f'path_to_output_videos_tse/AV_MossFormer2_TSE_16K/{video_stem}/')

# Top-level Blocks container: the tabbed interface is created inside it
# further down, and it is the object that gets launched.
demo = gr.Blocks()

# Tab 1: speech enhancement. Takes an audio file plus a sampling-rate choice
# and shows the enhanced audio produced by fn_clearvoice_se.
se_demo = gr.Interface(
    fn=fn_clearvoice_se,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
        gr.Dropdown(
            ["16000 Hz", "48000 Hz"], value="16000 Hz", multiselect=False, info="Choose a sampling rate for your output."
        ),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    # The anchor is closed with </a>; the previous "<a/>" was malformed HTML.
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice</a>: Speech Enhancement",
    description=("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is AI-powered and extracts clear speech from background noise for enhanced speech quality. It supports both 16 kHz and 48 kHz audio outputs. "
                 "To try it, simply upload your audio, or click one of the examples. "),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement</a> </p>"
             "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"),
    examples=[
        ["examples/mandarin_speech_16kHz.wav", "16000 Hz"],
        ["examples/english_speech_48kHz.wav", "48000 Hz"],
    ],
    cache_examples=True,
)

# Tab 2: speech separation. Takes one audio file and shows the two separated
# streams produced by fn_clearvoice_ss.
ss_demo = gr.Interface(
    fn=fn_clearvoice_ss,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    # The anchor is closed with </a>; the previous "<a/>" was malformed HTML.
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice</a>: Speech Separation",
    description=("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is powered by AI and separates individual speech from mixed audio. It supports 16 kHz and two output streams. "
                 "To try it, simply upload your audio, or click one of the examples. "),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> </p>"
             "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"),
    examples=[
        ['examples/female_female_speech.wav'],
        ['examples/female_male_speech.wav'],
    ],
    cache_examples=True,
)

# Tab 3: audio-visual speaker extraction. Takes one video and shows the list
# of result videos returned by fn_clearvoice_tse in a gallery.
tse_demo = gr.Interface(
    fn=fn_clearvoice_tse,
    inputs=[
        gr.Video(label="Input Video"),
    ],
    outputs=[
        gr.Gallery(label="Output Video List")
    ],
    # The anchor is closed with </a>; the previous "<a/>" was malformed HTML.
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice</a>: Audio-Visual Speaker Extraction",
    description=("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is AI-powered and extracts each speaker's voice from a multi-speaker video using facial recognition. "
                 "To try it, simply upload your video, or click one of the examples. "),
    # No `article` is set for this tab.
    examples=[
        ['examples/001.mp4'],
        ['examples/002.mp4'],
    ],
    cache_examples=True,
)

# Assemble the three task interfaces into one tabbed page inside the Blocks
# container, then start the app.
task_interfaces = [se_demo, ss_demo, tse_demo]
task_tab_names = [
    "Task 1: Speech Enhancement",
    "Task 2: Speech Separation",
    "Task 3: Audio-Visual Speaker Extraction",
]

with demo:
    gr.TabbedInterface(task_interfaces, task_tab_names)

demo.launch()