#!/usr/bin/python3 | |
# -*- coding: utf-8 -*- | |
""" | |
任意格式转到 wav 8000 int16 格式。 | |
多通道转单通道。 | |
音频 pad 加长。 | |
""" | |
import argparse | |
import json | |
from pathlib import Path | |
import platform | |
import tempfile | |
from typing import Tuple, List | |
import uuid | |
import gradio as gr | |
import numpy as np | |
from scipy.io import wavfile | |
from project_settings import project_path | |
from toolbox.audio_edit.info import get_audio_info, engine_to_function as info_engine_to_function | |
from toolbox.audio_edit.convert import audio_convert, engine_to_function as cvt_engine_to_function | |
from toolbox.audio_edit.speech_speed import change_speech_speed, engine_to_function as speed_engine_to_function | |
from toolbox.audio_edit.volume import change_volume, engine_to_function as volume_engine_to_function | |
from toolbox.audio_edit.augment import mix_speech_and_noise | |
from toolbox.audio_edit.reverb import reverb, engine_to_function as reverb_engine_to_function | |
def get_args():
    """Parse command-line arguments.

    Returns an argparse namespace with:
        examples_dir: directory scanned for example wav files
                      (defaults to <project>/data/examples).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--examples_dir",
        type=str,
        default=(project_path / "data/examples").as_posix(),
    )
    return parser.parse_args()
def save_input_audio(sample_rate: int, signal: np.ndarray) -> str:
    """Persist an in-memory waveform to a unique temp .wav file.

    :param sample_rate: sampling rate written into the wav header.
    :param signal: waveform samples as a numpy array (dtype is kept as-is).
    :return: posix path of the written file under <tmpdir>/input_audio/.
    """
    out_dir = Path(tempfile.gettempdir()) / "input_audio"
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = (out_dir / f"{uuid.uuid4()}.wav").as_posix()
    wavfile.write(out_path, sample_rate, signal)
    return out_path
def when_click_get_audio_info(audio_t, engine: str):
    """Gradio handler for the "info" tab.

    :param audio_t: gr.Audio value, a (sample_rate, numpy signal) tuple.
    :param engine: key selecting the backend in info_engine_to_function.
    :return: (pretty-printed JSON info or None, status message).
    """
    sample_rate, signal = audio_t
    filename = save_input_audio(sample_rate, signal)
    try:
        info: dict = get_audio_info(filename, engine)
    except Exception as e:
        return None, f"failed. error type: {type(e)}, error text: {str(e)}"
    return json.dumps(info, ensure_ascii=False, indent=4), "success"
def when_click_audio_convert(audio_t,
                             to_sample_rate: int = 8000,
                             sample_width: int = 2,
                             channels: str = "0",
                             engine: str = "librosa",
                             ) -> Tuple[str, str, str, str]:
    """Gradio handler for the "convert" tab: resample / requantize / pick channels.

    :param audio_t: gr.Audio value, a (sample_rate, numpy signal) tuple.
    :param to_sample_rate: target sample rate in Hz.
    :param sample_width: target bytes per sample.
    :param channels: comma-separated channel indices to keep, e.g. "0,1".
    :param engine: key selecting the backend in cvt_engine_to_function.
    :return: (output file path, origin info JSON, output info JSON, message);
             the first three are None on failure.
    """
    sample_rate, signal = audio_t
    filename = save_input_audio(sample_rate, signal)
    try:
        output_file: str = audio_convert(
            filename,
            to_sample_rate=to_sample_rate,
            sample_width=sample_width,
            channels=channels,
            engine=engine,
        )
        origin_audio_info = json.dumps(
            get_audio_info(filename, engine="wave"), ensure_ascii=False, indent=4
        )
        output_audio_info = json.dumps(
            get_audio_info(output_file, engine="wave"), ensure_ascii=False, indent=4
        )
    except Exception as e:
        return None, None, None, f"failed. error type: {type(e)}, error text: {str(e)}"
    return output_file, origin_audio_info, output_audio_info, "success"
def when_click_change_speech_speed(audio_t, speed: float = 1.0, engine: str = "librosa"):
    """Gradio handler for the "speech_speed" tab: time-stretch the audio.

    :param audio_t: gr.Audio value, a (sample_rate, numpy signal) tuple.
    :param speed: playback-speed factor (1.0 keeps the original duration).
    :param engine: key selecting the backend in speed_engine_to_function.
    :return: (output file path, origin info JSON, output info JSON, message);
             the first three are None on failure.
    """
    sample_rate, signal = audio_t
    filename = save_input_audio(sample_rate, signal)
    try:
        output_file: str = change_speech_speed(filename, speed, engine)
        origin_audio_info = json.dumps(
            get_audio_info(filename, engine="pydub"), ensure_ascii=False, indent=4
        )
        output_audio_info = json.dumps(
            get_audio_info(output_file, engine="pydub"), ensure_ascii=False, indent=4
        )
    except Exception as e:
        return None, None, None, f"failed. error type: {type(e)}, error text: {str(e)}"
    return output_file, origin_audio_info, output_audio_info, "success"
def when_click_change_volume(audio_t: Tuple[int, np.ndarray],
                             radio: float = 1.0,
                             decibel: float = 0.0,
                             reference: str = None,
                             engine: str = "by_ffmpy_by_db",
                             ):
    """Gradio handler for the "volume" tab: rescale loudness.

    Fix: the parameter was annotated ``audio_t: str`` but it is the gr.Audio
    value, a (sample_rate, numpy signal) tuple, as the unpacking below shows.

    :param audio_t: gr.Audio value, a (sample_rate, numpy signal) tuple.
    :param radio: gain ratio (name kept for caller compatibility; likely "ratio").
    :param decibel: gain in dB.
    :param reference: path of a reference audio file, used by the
                      "by_pydub_by_reference" engine; may be None.
    :param engine: key selecting the backend in volume_engine_to_function.
    :return: (output file path or None, status message).
    """
    sample_rate, signal = audio_t
    filename = save_input_audio(sample_rate, signal)
    message = "success"
    try:
        output_file: str = change_volume(filename, radio, decibel, reference, engine)
    except Exception as e:
        output_file = None
        message = f"failed. error type: {type(e)}, error text: {str(e)}"
    return output_file, message
def when_click_pad_audio(audio_t, pad_seconds: float = 10, pad_mode: str = "zero"):
    """Gradio handler for the "pad" tab: lengthen a mono waveform.

    Fix: ``np.concat`` is only an alias added in NumPy 2.0; on older NumPy it
    raises AttributeError, which the broad except silently turned into a
    "failed" message. ``np.concatenate`` works on every NumPy version.

    :param audio_t: gr.Audio value, a (sample_rate, numpy signal) tuple;
                    the signal must be 1-D (mono).
    :param pad_seconds: amount of padding to append, in seconds
                        (the UI slider steps by 0.1, hence float).
    :param pad_mode: "zero" appends silence; "repeat" appends the signal
                     repeated (and truncated) to the requested length.
    :return: ((sample_rate, padded signal), message); on failure the original
             signal is returned unchanged with an error message.
    """
    sample_rate, signal = audio_t
    message = "success"
    pad_signal = signal
    try:
        if signal.ndim != 1:
            raise AssertionError(f"expected a mono (1-D) signal, got ndim={signal.ndim}")
        pad_length = int(pad_seconds * sample_rate)
        if pad_mode == "zero":
            pad = np.zeros(shape=(pad_length,), dtype=signal.dtype)
        elif pad_mode == "repeat":
            signal_length = len(signal)
            if pad_length <= signal_length:
                pad = signal[:pad_length]
            else:
                # enough whole copies to cover pad_length, then truncate
                repeats = pad_length // signal_length + 1
                pad = np.concatenate([signal] * int(repeats), axis=-1)[:pad_length]
        else:
            raise NotImplementedError(f"unsupported pad_mode: {pad_mode}")
        pad_signal = np.concatenate([signal, pad], axis=-1)
    except Exception as e:
        message = f"failed. error type: {type(e)}, error text: {str(e)}"
    return (sample_rate, pad_signal), message
def when_click_reverb(audio_t, kwargs: str, engine: str):
    """Gradio handler for the "reverb" tab.

    :param audio_t: gr.Audio value, a (sample_rate, numpy signal) tuple;
                    samples are assumed int16 and scaled to float32 in [-1, 1).
    :param kwargs: JSON string of engine-specific reverb parameters.
    :param engine: key selecting the backend in reverb_engine_to_function.
    :return: ((sample_rate, int16 reverberated signal or None), message).
    """
    sample_rate, signal = audio_t
    try:
        scaled = np.array(signal / (1 << 15), dtype=np.float32)
        params = json.loads(kwargs)
        wet = reverb(
            signal=scaled,
            sample_rate=sample_rate,
            engine=engine,
            **params,
        )
        reverberated_audio = np.array(wet * (1 << 15), dtype=np.int16)
        message = "success"
    except Exception as e:
        reverberated_audio = None
        message = f"failed. error type: {type(e)}, error text: {str(e)}"
    return (sample_rate, reverberated_audio), message
def when_click_mix_speech_and_noise(speech_t, noise_t, snr_db: float):
    """Gradio handler for the "mix" tab: overlay noise onto speech at a given SNR.

    :param speech_t: gr.Audio value for speech, a (sample_rate, int16 signal) tuple.
    :param noise_t: gr.Audio value for noise, a (sample_rate, int16 signal) tuple;
                    must share the speech sample rate.
    :param snr_db: target signal-to-noise ratio in dB.
    :return: ((speech sample_rate, int16 mixture), message); on failure the
             untouched speech signal is returned with an error message.
    """
    sample_rate1, speech = speech_t
    sample_rate2, noise = noise_t
    message = "success"
    mix_signal = speech
    try:
        if sample_rate1 != sample_rate2:
            raise AssertionError(f"sr of speech: {sample_rate1}, sr of noise: {sample_rate2}")

        def to_unit_float(x: np.ndarray) -> np.ndarray:
            # Only int16 input is supported; scale to float32 in [-1, 1).
            if x.dtype != np.int16:
                raise NotImplementedError
            return np.array(x, dtype=np.float32) / (1 << 15)

        mixed = mix_speech_and_noise(to_unit_float(speech), to_unit_float(noise), snr_db)
        mix_signal = np.array(mixed * (1 << 15), dtype=np.int16)
    except Exception as e:
        message = f"failed. error type: {type(e)}, error text: {str(e)}"
    # np.int16
    return (sample_rate1, mix_signal), message
# Example rows for the "convert" tab:
# [audio path, to_sample_rate, sample_width, channels, engine].
audio_convert_examples = [
    [
        (project_path / "data/examples/default/audio_0_2.wav").as_posix(),
        8000, 2, "0", "librosa"
    ]
]
# Example rows for the "volume" tab:
# [audio path, radio, decibel, reference path or None, engine].
change_volume_examples = [
    [
        (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        1.0, -10.0,
        None,
        "by_ffmpy_by_db"
    ],
    [
        (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        0.3, 0.0,
        None,
        "by_ffmpy_by_radio"
    ],
    [
        (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        1.0, -10.0,
        None,
        "by_pydub_by_db"
    ],
    [
        (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        1.0, 0.0,
        # reference file whose loudness is matched by "by_pydub_by_reference"
        (project_path / "data/examples/default/audio_0_2.wav").as_posix(),
        "by_pydub_by_reference"
    ]
]
# Example rows for the "pad" tab: [audio path, pad_seconds, pad_mode].
pad_audio_examples = [
    [
        (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        10, "zero",
    ],
]
# Example rows for the "reverb" tab: [audio path, JSON kwargs string, engine].
# The kwargs schema differs per engine (pedalboard vs pyroomacoustics).
reverb_examples = [
    [
        (project_path / "data/examples/default/audio_0_2.wav").as_posix(),
        '{\n  "room_size": 0.25,\n  "damping": 0.5,\n  "width": 1.0,\n  "dry_level": 0.4,\n  "wet_level": 0.6,\n  "freeze_mode": false\n}',
        "pedalboard",
    ],
    [
        (project_path / "data/examples/default/audio_0_2.wav").as_posix(),
        '{\n  "room_size": [4.0, 6.0],\n  "source_position": [2.5, 4.5],\n  "microphone_array": [\n    [1.5, 1.5],\n    [2.5, 1.5]\n  ],\n  "output_microphone_idx": 0\n}',
        "pyroomacoustics",
    ]
]
# Example rows for the "mix" tab: [speech path, noise path, snr_db].
mix_speech_and_noise_examples = [
    [
        (project_path / "data/examples/mix/speech/000f62f5-5b05-4494-a8db-0eaca3ebd871_th-TH_1678353399860.wav").as_posix(),
        (project_path / "data/examples/mix/noise/000e2a2e-43c8-4752-8e26-34207fa6e9e4_th-TH_1678244573769.wav").as_posix(),
        -5,
    ],
    [
        (project_path / "data/examples/mix/speech/0000c655-3a8e-4196-bc31-c01fa8d115cc_th-TH_1678768644585.wav").as_posix(),
        (project_path / "data/examples/mix/noise/000f28d7-2129-49d5-9942-16ebf60e8285_th-TH_1678343313388.wav").as_posix(),
        0,
    ],
    [
        (project_path / "data/examples/mix/speech/001df4d1-9f7a-4e78-adc9-ef26d07eba60_th-TH_1667878032.0303788.wav").as_posix(),
        (project_path / "data/examples/mix/noise/0001f9f2-3626-427f-8ae5-105d81fcb5a3_th-TH_1678772646723.wav").as_posix(),
        5,
    ],
    [
        (project_path / "data/examples/mix/speech/001ef59d-b266-4409-b89c-627e3d7fb27d_th-TH_1678356022482.wav").as_posix(),
        (project_path / "data/examples/mix/noise/00240453-cd58-4059-9a38-d00583b879c7_th-TH_1678168729318.wav").as_posix(),
        10,
    ]
]
def main():
    """Build the Gradio Blocks UI — one tab per audio operation — and launch it.

    Each tab wires a gr.Examples gallery and a button to the matching
    ``when_click_*`` handler defined above.
    """
    args = get_args()

    # examples
    examples_dir = Path(args.examples_dir)

    # choices
    # Dropdown options are the engine keys exposed by each toolbox module.
    info_choices = list(info_engine_to_function.keys())
    cvt_choices = list(cvt_engine_to_function.keys())
    speed_choices = list(speed_engine_to_function.keys())
    volume_choices = list(volume_engine_to_function.keys())
    reverb_choices = list(reverb_engine_to_function.keys())

    # ui
    with gr.Blocks() as blocks:
        with gr.Tabs():
            # Tab: inspect metadata of an uploaded audio file.
            with gr.TabItem("info"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        info_audio = gr.Audio(label="audio")
                        info_engine = gr.Dropdown(choices=info_choices, value=info_choices[0], label="engine")
                        info_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        info_output = gr.Text(label="output")
                        info_log = gr.Text(label="log")
                gr.Examples(
                    examples=[
                        [filename.as_posix(), "wave"]
                        for filename in examples_dir.glob("**/*.wav")
                    ],
                    inputs=[info_audio, info_engine],
                    outputs=[info_output, info_log],
                    fn=when_click_get_audio_info,
                )
                info_button.click(
                    when_click_get_audio_info,
                    inputs=[info_audio, info_engine],
                    outputs=[info_output, info_log]
                )
            # Tab: convert sample rate / sample width / channel selection.
            with gr.TabItem("convert"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        cvt_audio = gr.Audio(label="audio")
                        with gr.Row():
                            cvt_sample_rate = gr.Dropdown(choices=[8000], value=8000, label="sample_rate")
                            cvt_sample_width = gr.Dropdown(choices=[2], value=2, label="sample_width")
                            cvt_channels = gr.Text(
                                value="0", label="channels",
                                info = "The channels to be retained, separated by commas, such as `0,1`"
                            )
                        cvt_engine = gr.Dropdown(choices=cvt_choices, value=cvt_choices[0], label="engine")
                        cvt_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        cvt_output_audio = gr.Audio(label="output_audio")
                        cvt_origin_audio_info = gr.Text(label="origin_audio_info")
                        cvt_output_audio_info = gr.Text(label="output_audio_info")
                        cvt_log = gr.Text(label="log")
                gr.Examples(
                    examples=audio_convert_examples,
                    inputs=[
                        cvt_audio,
                        cvt_sample_rate, cvt_sample_width, cvt_channels,
                        cvt_engine,
                    ],
                    outputs=[
                        cvt_output_audio,
                        cvt_origin_audio_info, cvt_output_audio_info,
                        cvt_log
                    ],
                    fn=when_click_audio_convert,
                )
                cvt_button.click(
                    when_click_audio_convert,
                    inputs=[
                        cvt_audio,
                        cvt_sample_rate, cvt_sample_width, cvt_channels,
                        cvt_engine,
                    ],
                    outputs=[
                        cvt_output_audio,
                        cvt_origin_audio_info, cvt_output_audio_info,
                        cvt_log
                    ],
                )
            # Tab: time-stretch the audio by a speed factor.
            with gr.TabItem("speech_speed"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        speech_speed_audio = gr.Audio(label="audio")
                        with gr.Row():
                            speech_speed_speed = gr.Slider(minimum=0.0, maximum=4.0, value=1.0, label="speed")
                            speech_speed_engine = gr.Dropdown(choices=speed_choices, value=speed_choices[0], label="engine")
                        speech_speed_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        speech_speed_output_audio = gr.Audio(label="output_audio")
                        speech_speed_origin_audio_info = gr.Text(label="origin_audio_info")
                        speech_speed_output_audio_info = gr.Text(label="output_audio_info")
                        speech_speed_log = gr.Text(label="log")
                gr.Examples(
                    examples=[
                        [filename.as_posix(), 0.5]
                        for filename in examples_dir.glob("**/*.wav")
                    ],
                    inputs=[speech_speed_audio, speech_speed_speed, speech_speed_engine],
                    outputs=[
                        speech_speed_output_audio,
                        speech_speed_origin_audio_info, speech_speed_output_audio_info,
                        speech_speed_log,
                    ],
                    fn=when_click_change_speech_speed,
                )
                speech_speed_button.click(
                    when_click_change_speech_speed,
                    inputs=[speech_speed_audio, speech_speed_speed, speech_speed_engine],
                    outputs=[
                        speech_speed_output_audio,
                        speech_speed_origin_audio_info, speech_speed_output_audio_info,
                        speech_speed_log,
                    ]
                )
            # Tab: change loudness by ratio, dB, or a reference file.
            with gr.TabItem("volume"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        volume_speed_audio = gr.Audio(label="audio")
                        with gr.Row():
                            with gr.Column():
                                volume_radio = gr.Slider(minimum=0.0, maximum=3.0, value=1.0, step=0.1, label="radio")
                                volume_decibel = gr.Slider(minimum=-30.0, maximum=30.0, value=0.0, step=0.1, label="decibel")
                                volume_engine = gr.Dropdown(choices=volume_choices, value=volume_choices[0], label="engine")
                            with gr.Column():
                                # reference audio, only used by the "by_pydub_by_reference" engine
                                volume_reference = gr.File(label="reference")
                        volume_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        volume_output_audio = gr.Audio(label="output_audio")
                        volume_log = gr.Text(label="log")
                gr.Examples(
                    examples=change_volume_examples,
                    inputs=[volume_speed_audio, volume_radio, volume_decibel, volume_reference, volume_engine],
                    outputs=[
                        volume_output_audio,
                        volume_log,
                    ],
                    fn=when_click_change_volume,
                )
                volume_button.click(
                    when_click_change_volume,
                    inputs=[volume_speed_audio, volume_radio, volume_decibel, volume_reference, volume_engine],
                    outputs=[
                        volume_output_audio,
                        volume_log,
                    ]
                )
            # Tab: append padding (silence or repeated signal) to the audio.
            with gr.TabItem("pad"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        pad_audio = gr.Audio(label="audio")
                        with gr.Row():
                            pad_seconds = gr.Slider(minimum=0, maximum=100, value=20, step=0.1, label="pad_seconds")
                            pad_mode = gr.Dropdown(choices=["zero", "repeat"], value="zero", label="pad_mode")
                        pad_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        pad_output_audio = gr.Audio(label="output_audio")
                        pad_log = gr.Text(label="log")
                gr.Examples(
                    examples=pad_audio_examples,
                    inputs=[pad_audio, pad_seconds, pad_mode],
                    outputs=[
                        pad_output_audio, pad_log
                    ],
                    fn=when_click_pad_audio,
                )
                pad_button.click(
                    when_click_pad_audio,
                    inputs=[pad_audio, pad_seconds, pad_mode],
                    outputs=[
                        pad_output_audio, pad_log
                    ],
                )
            # Tab: apply reverb; kwargs is a JSON string of engine parameters.
            with gr.TabItem("reverb"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        reverb_audio = gr.Audio(label="audio")
                        reverb_kwargs = gr.Textbox(lines=8, label="kwargs")
                        reverb_engine = gr.Dropdown(choices=reverb_choices, value=reverb_choices[0], label="engine")
                        reverb_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        reverb_output_audio = gr.Audio(label="output_audio")
                        reverb_log = gr.Text(label="log")
                gr.Examples(
                    examples=reverb_examples,
                    inputs=[reverb_audio, reverb_kwargs, reverb_engine],
                    outputs=[
                        reverb_output_audio, reverb_log
                    ],
                    fn=when_click_reverb,
                )
                reverb_button.click(
                    when_click_reverb,
                    inputs=[reverb_audio, reverb_kwargs, reverb_engine],
                    outputs=[
                        reverb_output_audio, reverb_log
                    ],
                )
            # Tab: mix speech with noise at a chosen SNR.
            with gr.TabItem("mix"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        mix_speed_audio = gr.Audio(label="speech")
                        mix_noise_audio = gr.Audio(label="noise")
                        with gr.Row():
                            mix_snr_db = gr.Slider(minimum=-10, maximum=20, value=10, step=0.1, label="snr_db")
                        mix_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        mix_output_audio = gr.Audio(label="output_audio")
                        mix_log = gr.Text(label="log")
                gr.Examples(
                    examples=mix_speech_and_noise_examples,
                    inputs=[mix_speed_audio, mix_noise_audio, mix_snr_db],
                    outputs=[
                        mix_output_audio, mix_log
                    ],
                    fn=when_click_mix_speech_and_noise,
                )
                mix_button.click(
                    when_click_mix_speech_and_noise,
                    inputs=[mix_speed_audio, mix_noise_audio, mix_snr_db],
                    outputs=[
                        mix_output_audio, mix_log
                    ],
                )

    # http://127.0.0.1:7860/
    blocks.queue().launch(
        # NOTE(review): both branches evaluate to False, so sharing is always
        # disabled; if a share link was intended off-Windows, the else arm
        # should presumably be True — confirm intent before changing.
        share=False if platform.system() == "Windows" else False,
        # server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
        server_name="0.0.0.0",
        server_port=7860,
    )
    return
# Script entry point: build the UI and serve it on port 7860.
if __name__ == "__main__":
    main()