Spaces:
Running
on
Zero
Running
on
Zero
File size: 6,865 Bytes
02c7bdf bdaf47a 02c7bdf 7b02833 02c7bdf a1655f3 02c7bdf e537531 e63a812 3e28721 e63a812 a9e592e 132a2a9 e63a812 02c7bdf 3192961 e537531 b78b7d0 4554491 b78b7d0 4554491 b78b7d0 f8605aa b02e870 f8605aa 9e426ab 0e2caa6 d431bb0 f8605aa d431bb0 f8605aa 27ef3d7 3ee12a7 b02e870 f8605aa 3192961 3956066 b78b7d0 e805751 e63a812 3e28721 e63a812 b78b7d0 e805751 b78b7d0 43e8301 2e5499c 8310825 43e8301 b78b7d0 dc8fb4a b78b7d0 8bb6908 3956066 e805751 3956066 e805751 3956066 43e8301 2e5499c 8310825 43e8301 3956066 6320c59 d04ae35 3956066 b78b7d0 f8605aa 7be4073 f8605aa d431bb0 f8605aa 56f7076 f8605aa 43e8301 2e5499c 8310825 9e426ab f8605aa c7ca2a0 f8605aa 99710ec 8310825 49effbd 0f8dddd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import os
import tempfile

import torch
import soundfile as sf
import gradio as gr
import spaces
from clearvoice import ClearVoice
@spaces.GPU
def fn_clearvoice_se(input_wav, sr):
    """Run ClearVoice speech enhancement on one audio file.

    Args:
        input_wav: Path of the audio file, forwarded to ClearVoice as
            ``input_path``.
        sr: Sampling-rate choice coming from the UI dropdown.
            ``"16000 Hz"`` selects the ``FRCRN_SE_16K`` model and a 16 kHz
            output; any other value selects ``MossFormer2_SE_48K`` and a
            48 kHz output.

    Returns:
        Path of a newly created WAV file containing the enhanced audio.
    """
    if sr == "16000 Hz":
        model_name, fs = 'FRCRN_SE_16K', 16000
    else:
        model_name, fs = 'MossFormer2_SE_48K', 48000

    myClearVoice = ClearVoice(task='speech_enhancement', model_names=[model_name])
    result = myClearVoice(input_path=input_wav, online_write=False)

    # The call may return the waveform directly or a dict of waveforms;
    # for a dict, use its first entry.
    if isinstance(result, dict):
        output_wav = next(iter(result.values()))
    else:
        output_wav = result

    # Write to a unique file per call: a fixed name in the working directory
    # would let concurrent requests overwrite each other's result.
    # The file is intentionally left on disk so the caller can read it.
    with tempfile.NamedTemporaryFile(
        prefix='enhanced_', suffix='.wav', delete=False
    ) as tmp:
        output_path = tmp.name
    sf.write(output_path, output_wav, fs)
    return output_path
@spaces.GPU
def fn_clearvoice_ss(input_wav):
    """Run ClearVoice speech separation on one audio file.

    Args:
        input_wav: Path of the mixed-speech audio file, forwarded to
            ClearVoice as ``input_path``.

    Returns:
        A 2-tuple with the paths of two newly created 16 kHz WAV files,
        one per separated stream (first stream, second stream).
    """
    myClearVoice = ClearVoice(task='speech_separation', model_names=['MossFormer2_SS_16K'])
    result = myClearVoice(input_path=input_wav, online_write=False)

    # The call may return the list of streams directly or a dict of such
    # lists; for a dict, use its first entry.
    if isinstance(result, dict):
        result = next(iter(result.values()))
    streams = (result[0], result[1])

    # Write each stream to its own unique file: fixed names in the working
    # directory would let concurrent requests overwrite each other's result.
    # The files are intentionally left on disk so the caller can read them.
    output_paths = []
    for index, stream in enumerate(streams, start=1):
        with tempfile.NamedTemporaryFile(
            prefix=f'separated_s{index}_', suffix='.wav', delete=False
        ) as tmp:
            stream_path = tmp.name
        sf.write(stream_path, stream, 16000)
        output_paths.append(stream_path)
    return output_paths[0], output_paths[1]
def find_mp4_files(directory):
    """Collect the ``est*.mp4`` files located anywhere under ``directory``.

    Args:
        directory: Root directory to search; subdirectories are included.

    Returns:
        A sorted list of paths (each joined onto the directory it was found
        in) for files whose name starts with ``est`` and ends with ``.mp4``.
        The list is empty when nothing matches or the directory does not
        exist.
    """
    matches = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.startswith('est') and name.endswith('.mp4')
    ]
    # os.walk yields entries in arbitrary directory-listing order; sort so
    # callers always see the same ordering for the same set of files.
    matches.sort()
    return matches
@spaces.GPU()
def fn_clearvoice_tse(input_video):
    """Run ClearVoice audio-visual target speaker extraction on one video.

    Args:
        input_video: Path of the input video, forwarded to ClearVoice as
            ``input_path``.

    Returns:
        The list produced by ``find_mp4_files`` for the per-video folder
        under ``path_to_output_videos_tse/AV_MossFormer2_TSE_16K/``.
    """
    extractor = ClearVoice(
        task='target_speaker_extraction',
        model_names=['AV_MossFormer2_TSE_16K'],
    )
    print(f'input_video: {input_video}')

    # Results are written by the library (online_write=True) rather than
    # returned, so they are collected from disk afterwards.
    extractor(
        input_path=input_video,
        online_write=True,
        output_path='path_to_output_videos_tse',
    )

    # Folder name = file name up to its first dot.
    # NOTE(review): presumably mirrors how the library names its output
    # folder - confirm against ClearVoice.
    video_stem = os.path.basename(input_video).split(".")[0]
    result_dir = f'path_to_output_videos_tse/AV_MossFormer2_TSE_16K/{video_stem}/'
    return find_mp4_files(result_dir)
# Top-level container that will host the tabbed interfaces.
demo = gr.Blocks()

# Tab 1: speech enhancement. Takes an audio file plus a sampling-rate choice
# and shows the enhanced audio returned by fn_clearvoice_se.
se_demo = gr.Interface(
    fn=fn_clearvoice_se,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
        gr.Dropdown(
            ["16000 Hz", "48000 Hz"], value="16000 Hz", multiselect=False, info="Choose a sampling rate for your output."
        ),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    # The anchor is closed with </a>; the previous "<a/>" left the link open.
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice</a>: Speech Enhancement",
    description=("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is AI-powered and extracts clear speech from background noise for enhanced speech quality. It supports both 16 kHz and 48 kHz audio outputs. "
                 "To try it, simply upload your audio, or click one of the examples. "),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement</a> </p>"
             "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"),
    examples=[
        ["examples/mandarin_speech_16kHz.wav", "16000 Hz"],
        ["examples/english_speech_48kHz.wav", "48000 Hz"],
    ],
    cache_examples=True,
)
# Tab 2: speech separation. Takes one mixed-speech audio file and shows the
# two separated streams returned by fn_clearvoice_ss.
ss_demo = gr.Interface(
    fn=fn_clearvoice_ss,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    # The anchor is closed with </a>; the previous "<a/>" left the link open.
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice</a>: Speech Separation",
    description=("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is powered by AI and separates individual speech from mixed audio. It supports 16 kHz and two output streams. "
                 "To try it, simply upload your audio, or click one of the examples. "),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> </p>"
             "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"),
    examples=[
        ['examples/female_female_speech.wav'],
        ['examples/female_male_speech.wav'],
    ],
    cache_examples=True,
)
# Tab 3: audio-visual speaker extraction. Takes one video and shows, in a
# gallery, the list of output videos returned by fn_clearvoice_tse.
tse_demo = gr.Interface(
    fn=fn_clearvoice_tse,
    inputs=[
        gr.Video(label="Input Video"),
    ],
    outputs=[
        gr.Gallery(label="Output Video List")
    ],
    # The anchor is closed with </a>; the previous "<a/>" left the link open.
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice</a>: Audio-Visual Speaker Extraction",
    description=("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is AI-powered and extracts each speaker's voice from a multi-speaker video using facial recognition. "
                 "To try it, simply upload your video, or click one of the examples. "),
    examples=[
        ['examples/001.mp4'],
        ['examples/002.mp4'],
    ],
    cache_examples=True,
)
# Mount the three task interfaces as tabs inside the top-level Blocks app,
# then start the Gradio server.
with demo:
    gr.TabbedInterface(
        [se_demo, ss_demo, tse_demo],
        ["Task 1: Speech Enhancement", "Task 2: Speech Separation", "Task 3: Audio-Visual Speaker Extraction"],
    )

demo.launch()