|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr |
|
import wesep |
|
import soundfile |
|
import torchaudio |
|
import os |
|
from scipy.signal import resample |
|
|
|
# Working directories: uploaded/enrollment audio goes to `input_path`,
# the separated result is written to `output_path`.
input_path = "./audios"

output_path = "./extracted"

# makedirs(exist_ok=True) is idempotent and avoids the race between the
# `os.path.exists` check and `os.mkdir` in the original check-then-create
# pattern; it also creates intermediate directories if ever needed.
os.makedirs(input_path, exist_ok=True)

os.makedirs(output_path, exist_ok=True)
|
|
|
|
|
en_model = wesep.load_model("english") |
|
|
|
def save_to_file(audio, filename, target_sr=16000):
    """Persist a Gradio audio tuple to disk, resampled to ``target_sr``.

    Args:
        audio: ``(sample_rate, samples)`` tuple as delivered by ``gr.Audio``.
        filename: wav file name to create under the module-level ``input_path``.
        target_sr: desired sample rate in Hz (default 16000, what the
            extraction model expects).

    Returns:
        Path of the written (and, if needed, resampled) wav file.
    """
    audio_path = os.path.join(input_path, filename)
    # Gradio supplies (rate, data); soundfile.write expects (data, rate).
    soundfile.write(audio_path, audio[1], audio[0])
    pcm, sample_rate = torchaudio.load(audio_path)
    # Fix: resample from the rate torchaudio actually reports for the file
    # (the original read but ignored it and trusted audio[0] instead), and
    # skip the resample + rewrite entirely when the rate already matches.
    if sample_rate != target_sr:
        transform = torchaudio.transforms.Resample(
            orig_freq=sample_rate,
            new_freq=target_sr)
        pcm = transform(pcm)
        torchaudio.save(audio_path, pcm, target_sr)

    return audio_path
|
|
|
|
|
def speaker_extraction(audio1, audio2, mixture, select_speaker='#1'):
    """Extract the selected speaker's voice from a mixture recording.

    Args:
        audio1: enrollment audio for speaker #1 (Gradio ``(rate, data)`` tuple).
        audio2: enrollment audio for speaker #2 (same format).
        mixture: the mixed recording to separate (same format).
        select_speaker: ``'#1'`` or ``'#2'`` — which enrolled speaker to extract.

    Returns:
        Path to the extracted speech wav, or a ``gr.Warning`` when any
        input audio is missing.
    """
    # Fix: identity comparison with None (`is None`), per PEP 8; also drops
    # the leftover debug print("??").
    if audio1 is None or audio2 is None or mixture is None:
        return gr.Warning("The audio file cannot be empty, please upload a valid audio file. 音频文件不能为空,请上传有效的音频文件。")

    audio_path1 = save_to_file(audio1, "enroll_1.wav", 16000)
    audio_path2 = save_to_file(audio2, "enroll_2.wav", 16000)
    audio_mixture = save_to_file(mixture, "mixture.wav", 16000)

    model = en_model

    # Map the radio choice to the matching enrollment file; any other value
    # (e.g. None when nothing is selected) is passed through unchanged, as
    # before — TODO(review): consider validating it here.
    if select_speaker == '#1':
        select_speaker = audio_path1
    elif select_speaker == '#2':
        select_speaker = audio_path2

    speech = model.extract_speech(audio_mixture, select_speaker)
    # os.path.join instead of string concatenation for the output path.
    audio_speech = os.path.join(output_path, "speech.wav")
    soundfile.write(audio_speech, speech[0], 16000)

    return audio_speech
|
|
|
|
|
# --- Gradio UI wiring -------------------------------------------------------
# Three audio uploads (two enrollment clips plus the mixture) and a radio
# button choosing which enrolled speaker to pull out of the mixture.
inputs = [
    gr.Audio(show_download_button=True, label='Enroll Speaker#1'),
    gr.Audio(show_download_button=True, label='Enroll Speaker#2'),
    gr.Audio(show_download_button=True, label='Mixture'),
    gr.Radio(['#1', '#2'], label='Extract Speaker #'),
]

# The extracted speech comes back as a file path on disk.
output = gr.Audio(type="filepath", label="Extract Speaker")

description = (
    "<p>WeSep Demo ! Try it with your own voice ! "
    "Note: We recommend that the audio length be greater than 5s !</p>"
)

article = (
    "<p style='text-align: center'>"
    "<a href='https://github.com/wenet-e2e/wesep' target='_blank'>"
    "Github: Learn more about WeSep</a>"
    "</p>"
)

# Canned demo rows: one English set extracting speaker #1, one Chinese set
# extracting speaker #2.
examples = [
    [
        'examples/enroll_1.wav',
        'examples/enroll_2.wav',
        'examples/mixture.wav',
        '#1',
    ],
    [
        'examples/enroll1_zh.wav',
        'examples/enroll2_zh.wav',
        'examples/mixture_zh.wav',
        '#2',
    ],
]

interface = gr.Interface(
    fn=speaker_extraction,
    inputs=inputs,
    outputs=output,
    title="Speaker Extraction in WeSep : 基于 WeSep 的说话人提取",
    description=description,
    article=article,
    examples=examples,
)

interface.launch()
|
|