# audio_edit / main.py
# NOTE: HuggingFace Space file (commit 35db526); web-UI listing residue
# ("picture / update / raw / history / blame / 22.1 kB") converted to comments
# so the file is valid Python.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
任意格式转到 wav 8000 int16 格式。
多通道转单通道。
音频 pad 加长。
"""
import argparse
import json
from pathlib import Path
import platform
import tempfile
from typing import Tuple, List
import uuid
import gradio as gr
import numpy as np
from scipy.io import wavfile
from project_settings import project_path
from toolbox.audio_edit.info import get_audio_info, engine_to_function as info_engine_to_function
from toolbox.audio_edit.convert import audio_convert, engine_to_function as cvt_engine_to_function
from toolbox.audio_edit.speech_speed import change_speech_speed, engine_to_function as speed_engine_to_function
from toolbox.audio_edit.volume import change_volume, engine_to_function as volume_engine_to_function
from toolbox.audio_edit.augment import mix_speech_and_noise
from toolbox.audio_edit.reverb import reverb, engine_to_function as reverb_engine_to_function
def get_args():
    """Parse command-line arguments for the demo.

    Returns:
        argparse.Namespace with a single ``examples_dir`` attribute
        (defaults to ``<project>/data/examples``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--examples_dir",
        type=str,
        default=(project_path / "data/examples").as_posix(),
    )
    return parser.parse_args()
def save_input_audio(sample_rate: int, signal: np.ndarray) -> str:
    """Persist an in-memory waveform to a temporary WAV file.

    Gradio delivers audio as ``(sample_rate, ndarray)``; the downstream
    toolbox functions expect a file path, so the signal is written under
    ``<tmp>/input_audio/<uuid>.wav`` and that path is returned (posix form).
    """
    out_dir = Path(tempfile.gettempdir()) / "input_audio"
    out_dir.mkdir(parents=True, exist_ok=True)
    out_file = (out_dir / f"{uuid.uuid4()}.wav").as_posix()
    wavfile.write(out_file, sample_rate, signal)
    return out_file
def when_click_get_audio_info(audio_t, engine: str):
    """Handler for the "info" tab: report metadata of the uploaded audio.

    Writes the ``(sample_rate, signal)`` tuple to a temp WAV file, queries
    ``get_audio_info`` with the chosen engine, and returns
    ``(json_text_or_None, status_message)``.
    """
    sample_rate, signal = audio_t
    filename = save_input_audio(sample_rate, signal)
    try:
        info: dict = get_audio_info(filename, engine)
    except Exception as e:
        return None, f"failed. error type: {type(e)}, error text: {str(e)}"
    return json.dumps(info, ensure_ascii=False, indent=4), "success"
def when_click_audio_convert(audio_t,
                             to_sample_rate: int = 8000,
                             sample_width: int = 2,
                             channels: str = "0",
                             engine: str = "librosa",
                             ) -> Tuple[str, str, str, str]:
    """Handler for the "convert" tab: resample / requantize / select channels.

    Returns ``(output_file, origin_info_json, output_info_json, message)``;
    on error the first three are ``None`` and the message carries the details.
    """
    sample_rate, signal = audio_t
    filename = save_input_audio(sample_rate, signal)
    try:
        output_file: str = audio_convert(
            filename,
            to_sample_rate=to_sample_rate,
            sample_width=sample_width,
            channels=channels,
            engine=engine,
        )
        # Before/after metadata is probed with the "wave" engine and
        # rendered as pretty-printed JSON for display.
        origin_audio_info = json.dumps(
            get_audio_info(filename, engine="wave"),
            ensure_ascii=False, indent=4,
        )
        output_audio_info = json.dumps(
            get_audio_info(output_file, engine="wave"),
            ensure_ascii=False, indent=4,
        )
    except Exception as e:
        message = f"failed. error type: {type(e)}, error text: {str(e)}"
        return None, None, None, message
    return output_file, origin_audio_info, output_audio_info, "success"
def when_click_change_speech_speed(audio_t, speed: float = 1.0, engine: str = "librosa"):
    """Handler for the "speech_speed" tab: time-stretch the uploaded audio.

    Returns ``(output_file, origin_info_json, output_info_json, message)``;
    on error the first three are ``None`` and the message carries the details.
    """
    sample_rate, signal = audio_t
    filename = save_input_audio(sample_rate, signal)
    try:
        output_file: str = change_speech_speed(filename, speed, engine)
        # Metadata here is probed via "pydub" (unlike the convert tab's "wave").
        origin_audio_info = json.dumps(
            get_audio_info(filename, engine="pydub"),
            ensure_ascii=False, indent=4,
        )
        output_audio_info = json.dumps(
            get_audio_info(output_file, engine="pydub"),
            ensure_ascii=False, indent=4,
        )
    except Exception as e:
        message = f"failed. error type: {type(e)}, error text: {str(e)}"
        return None, None, None, message
    return output_file, origin_audio_info, output_audio_info, "success"
def when_click_change_volume(audio_t: Tuple[int, np.ndarray],
                             radio: float = 1.0,
                             decibel: float = 0.0,
                             reference: str = None,
                             engine: str = "by_ffmpy_by_db",
                             ):
    """Handler for the "volume" tab: rescale the loudness of the audio.

    :param audio_t: gradio audio value, a ``(sample_rate, signal)`` tuple.
        (Fixed: was annotated ``str`` although it is unpacked as a tuple.)
    :param radio: linear gain factor (presumably a typo for "ratio" —
        name kept for interface compatibility; TODO confirm with toolbox).
    :param decibel: gain in dB, used by the ``*_by_db`` engines.
    :param reference: path of a reference audio for ``by_pydub_by_reference``.
    :param engine: which ``change_volume`` backend to use.
    :return: ``(output_file_or_None, status_message)``.
    """
    sample_rate, signal = audio_t
    filename = save_input_audio(sample_rate, signal)
    message = "success"
    try:
        output_file: str = change_volume(filename, radio, decibel, reference, engine)
    except Exception as e:
        output_file = None
        message = f"failed. error type: {type(e)}, error text: {str(e)}"
    return output_file, message
def when_click_pad_audio(audio_t, pad_seconds: int = 10, pad_mode: str = "zero"):
    """Handler for the "pad" tab: lengthen a mono signal by ``pad_seconds``.

    :param audio_t: gradio audio value, a ``(sample_rate, signal)`` tuple;
        the signal must be 1-D (mono).
    :param pad_seconds: duration to append, in seconds.
    :param pad_mode: "zero" appends silence; "repeat" appends the signal
        tiled from its beginning.
    :return: ``((sample_rate, padded_signal), message)``; on any error the
        original signal is returned unchanged with a "failed" message.
    """
    sample_rate, signal = audio_t
    message = "success"
    pad_signal = signal
    try:
        if signal.ndim != 1:
            raise AssertionError(f"expected mono signal, got ndim={signal.ndim}")
        pad_length = int(pad_seconds * sample_rate)
        if pad_mode == "zero":
            pad = np.zeros(shape=(pad_length,), dtype=signal.dtype)
        elif pad_mode == "repeat":
            signal_length = len(signal)
            if pad_length <= signal_length:
                pad = signal[:pad_length]
            else:
                # Tile enough copies to cover pad_length, then trim.
                repeats = pad_length // signal_length + 1
                # Fixed: np.concat is a NumPy >= 2.0 alias; np.concatenate
                # works on all NumPy versions (np.concat raised
                # AttributeError on older NumPy, silently turning this
                # into a no-op "failed" result).
                pad = np.concatenate([signal] * int(repeats), axis=-1)
                pad = pad[:pad_length]
        else:
            raise NotImplementedError(f"unsupported pad_mode: {pad_mode}")
        pad_signal = np.concatenate([signal, pad], axis=-1)
    except Exception as e:
        message = f"failed. error type: {type(e)}, error text: {str(e)}"
    return (sample_rate, pad_signal), message
def when_click_reverb(audio_t, kwargs: str, engine: str):
    """Handler for the "reverb" tab: apply reverberation to the audio.

    ``kwargs`` is a JSON string of engine-specific parameters, forwarded
    verbatim to ``reverb``. The int16 signal is scaled to float32 in
    [-1, 1) before processing and scaled back afterwards.
    Returns ``((sample_rate, int16_signal_or_None), message)``.
    """
    sample_rate, signal = audio_t
    try:
        float_signal = np.array(signal / (1 << 15), dtype=np.float32)
        params = json.loads(kwargs)
        wet = reverb(
            signal=float_signal,
            sample_rate=sample_rate,
            engine=engine,
            **params,
        )
        wet = np.array(wet * (1 << 15), dtype=np.int16)
        return (sample_rate, wet), "success"
    except Exception as e:
        message = f"failed. error type: {type(e)}, error text: {str(e)}"
        return (sample_rate, None), message
def when_click_mix_speech_and_noise(speech_t, noise_t, snr_db: float):
    """Handler for the "mix" tab: blend speech with noise at a target SNR.

    Both inputs must share one sample rate and be int16; they are scaled
    to float32 in [-1, 1), mixed by ``mix_speech_and_noise``, and the
    result is scaled back to int16. On any error the untouched speech
    signal is returned together with a "failed" message.
    """
    sample_rate1, speech = speech_t
    sample_rate2, noise = noise_t
    message = "success"
    mix_signal = speech
    try:
        if sample_rate1 != sample_rate2:
            raise AssertionError(f"sr of speech: {sample_rate1}, sr of noise: {sample_rate2}")

        def to_float32(x: np.ndarray) -> np.ndarray:
            # Only int16 PCM is supported, matching the 1<<15 scaling.
            if x.dtype != np.int16:
                raise NotImplementedError
            return np.array(x, dtype=np.float32) / (1 << 15)

        mixed = mix_speech_and_noise(to_float32(speech), to_float32(noise), snr_db)
        mix_signal = np.array(mixed * (1 << 15), dtype=np.int16)
    except Exception as e:
        message = f"failed. error type: {type(e)}, error text: {str(e)}"
    # result signal is np.int16
    return (sample_rate1, mix_signal), message
# Example rows for the "convert" tab; each inner list matches the order of
# the tab's `inputs`: [audio, sample_rate, sample_width, channels, engine].
audio_convert_examples = [
    [
        (project_path / "data/examples/default/audio_0_2.wav").as_posix(),
        8000, 2, "0", "librosa"
    ]
]
# Example rows for the "volume" tab; each inner list matches the order of
# the tab's `inputs`: [audio, radio, decibel, reference, engine] — one row
# per supported volume engine.
change_volume_examples = [
    [
        (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        1.0, -10.0,
        None,
        "by_ffmpy_by_db"
    ],
    [
        (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        0.3, 0.0,
        None,
        "by_ffmpy_by_radio"
    ],
    [
        (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        1.0, -10.0,
        None,
        "by_pydub_by_db"
    ],
    [
        (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        1.0, 0.0,
        (project_path / "data/examples/default/audio_0_2.wav").as_posix(),
        "by_pydub_by_reference"
    ]
]
# Example rows for the "pad" tab; each inner list matches the order of
# the tab's `inputs`: [audio, pad_seconds, pad_mode].
pad_audio_examples = [
    [
        (project_path / "data/examples/default/audio_0_3_clone_from_audio_0_2.wav").as_posix(),
        10, "zero",
    ],
]
reverb_examples = [
[
(project_path / "data/examples/default/audio_0_2.wav").as_posix(),
'{\n "room_size": 0.25,\n "damping": 0.5,\n "width": 1.0,\n "dry_level": 0.4,\n "wet_level": 0.6,\n "freeze_mode": false\n}',
"pedalboard",
],
[
(project_path / "data/examples/default/audio_0_2.wav").as_posix(),
'{\n "room_size": [4.0, 6.0],\n "source_position": [2.5, 4.5],\n "microphone_array": [\n [1.5, 1.5],\n [2.5, 1.5]\n ],\n "output_microphone_idx": 0\n}',
"pyroomacoustics",
]
]
mix_speech_and_noise_examples = [
[
(project_path / "data/examples/mix/speech/000f62f5-5b05-4494-a8db-0eaca3ebd871_th-TH_1678353399860.wav").as_posix(),
(project_path / "data/examples/mix/noise/000e2a2e-43c8-4752-8e26-34207fa6e9e4_th-TH_1678244573769.wav").as_posix(),
-5,
],
[
(project_path / "data/examples/mix/speech/0000c655-3a8e-4196-bc31-c01fa8d115cc_th-TH_1678768644585.wav").as_posix(),
(project_path / "data/examples/mix/noise/000f28d7-2129-49d5-9942-16ebf60e8285_th-TH_1678343313388.wav").as_posix(),
0,
],
[
(project_path / "data/examples/mix/speech/001df4d1-9f7a-4e78-adc9-ef26d07eba60_th-TH_1667878032.0303788.wav").as_posix(),
(project_path / "data/examples/mix/noise/0001f9f2-3626-427f-8ae5-105d81fcb5a3_th-TH_1678772646723.wav").as_posix(),
5,
],
[
(project_path / "data/examples/mix/speech/001ef59d-b266-4409-b89c-627e3d7fb27d_th-TH_1678356022482.wav").as_posix(),
(project_path / "data/examples/mix/noise/00240453-cd58-4059-9a38-d00583b879c7_th-TH_1678168729318.wav").as_posix(),
10,
]
]
def main():
    """Build and launch the gradio demo: one tab per audio-editing tool."""
    args = get_args()

    # examples
    examples_dir = Path(args.examples_dir)

    # choices — dropdown options come from each toolbox module's engine map
    info_choices = list(info_engine_to_function.keys())
    cvt_choices = list(cvt_engine_to_function.keys())
    speed_choices = list(speed_engine_to_function.keys())
    volume_choices = list(volume_engine_to_function.keys())
    reverb_choices = list(reverb_engine_to_function.keys())

    # ui
    with gr.Blocks() as blocks:
        with gr.Tabs():
            with gr.TabItem("info"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        info_audio = gr.Audio(label="audio")
                        info_engine = gr.Dropdown(choices=info_choices, value=info_choices[0], label="engine")
                        info_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        info_output = gr.Text(label="output")
                        info_log = gr.Text(label="log")

                gr.Examples(
                    examples=[
                        [filename.as_posix(), "wave"]
                        for filename in examples_dir.glob("**/*.wav")
                    ],
                    inputs=[info_audio, info_engine],
                    outputs=[info_output, info_log],
                    fn=when_click_get_audio_info,
                )

                info_button.click(
                    when_click_get_audio_info,
                    inputs=[info_audio, info_engine],
                    outputs=[info_output, info_log]
                )
            with gr.TabItem("convert"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        cvt_audio = gr.Audio(label="audio")
                        with gr.Row():
                            cvt_sample_rate = gr.Dropdown(choices=[8000], value=8000, label="sample_rate")
                            cvt_sample_width = gr.Dropdown(choices=[2], value=2, label="sample_width")
                        cvt_channels = gr.Text(
                            value="0", label="channels",
                            info="The channels to be retained, separated by commas, such as `0,1`"
                        )
                        cvt_engine = gr.Dropdown(choices=cvt_choices, value=cvt_choices[0], label="engine")
                        cvt_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        cvt_output_audio = gr.Audio(label="output_audio")
                        cvt_origin_audio_info = gr.Text(label="origin_audio_info")
                        cvt_output_audio_info = gr.Text(label="output_audio_info")
                        cvt_log = gr.Text(label="log")

                gr.Examples(
                    examples=audio_convert_examples,
                    inputs=[
                        cvt_audio,
                        cvt_sample_rate, cvt_sample_width, cvt_channels,
                        cvt_engine,
                    ],
                    outputs=[
                        cvt_output_audio,
                        cvt_origin_audio_info, cvt_output_audio_info,
                        cvt_log
                    ],
                    fn=when_click_audio_convert,
                )

                cvt_button.click(
                    when_click_audio_convert,
                    inputs=[
                        cvt_audio,
                        cvt_sample_rate, cvt_sample_width, cvt_channels,
                        cvt_engine,
                    ],
                    outputs=[
                        cvt_output_audio,
                        cvt_origin_audio_info, cvt_output_audio_info,
                        cvt_log
                    ],
                )
            with gr.TabItem("speech_speed"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        speech_speed_audio = gr.Audio(label="audio")
                        with gr.Row():
                            speech_speed_speed = gr.Slider(minimum=0.0, maximum=4.0, value=1.0, label="speed")
                            speech_speed_engine = gr.Dropdown(choices=speed_choices, value=speed_choices[0], label="engine")
                        speech_speed_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        speech_speed_output_audio = gr.Audio(label="output_audio")
                        speech_speed_origin_audio_info = gr.Text(label="origin_audio_info")
                        speech_speed_output_audio_info = gr.Text(label="output_audio_info")
                        speech_speed_log = gr.Text(label="log")

                gr.Examples(
                    examples=[
                        # Fixed: example rows must match the three `inputs`
                        # components; the engine value was missing.
                        [filename.as_posix(), 0.5, speed_choices[0]]
                        for filename in examples_dir.glob("**/*.wav")
                    ],
                    inputs=[speech_speed_audio, speech_speed_speed, speech_speed_engine],
                    outputs=[
                        speech_speed_output_audio,
                        speech_speed_origin_audio_info, speech_speed_output_audio_info,
                        speech_speed_log,
                    ],
                    fn=when_click_change_speech_speed,
                )

                speech_speed_button.click(
                    when_click_change_speech_speed,
                    inputs=[speech_speed_audio, speech_speed_speed, speech_speed_engine],
                    outputs=[
                        speech_speed_output_audio,
                        speech_speed_origin_audio_info, speech_speed_output_audio_info,
                        speech_speed_log,
                    ]
                )
            with gr.TabItem("volume"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        volume_speed_audio = gr.Audio(label="audio")
                        with gr.Row():
                            with gr.Column():
                                volume_radio = gr.Slider(minimum=0.0, maximum=3.0, value=1.0, step=0.1, label="radio")
                                volume_decibel = gr.Slider(minimum=-30.0, maximum=30.0, value=0.0, step=0.1, label="decibel")
                                volume_engine = gr.Dropdown(choices=volume_choices, value=volume_choices[0], label="engine")
                            with gr.Column():
                                volume_reference = gr.File(label="reference")
                        volume_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        volume_output_audio = gr.Audio(label="output_audio")
                        volume_log = gr.Text(label="log")

                gr.Examples(
                    examples=change_volume_examples,
                    inputs=[volume_speed_audio, volume_radio, volume_decibel, volume_reference, volume_engine],
                    outputs=[
                        volume_output_audio,
                        volume_log,
                    ],
                    fn=when_click_change_volume,
                )

                volume_button.click(
                    when_click_change_volume,
                    inputs=[volume_speed_audio, volume_radio, volume_decibel, volume_reference, volume_engine],
                    outputs=[
                        volume_output_audio,
                        volume_log,
                    ]
                )
            with gr.TabItem("pad"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        pad_audio = gr.Audio(label="audio")
                        with gr.Row():
                            pad_seconds = gr.Slider(minimum=0, maximum=100, value=20, step=0.1, label="pad_seconds")
                            pad_mode = gr.Dropdown(choices=["zero", "repeat"], value="zero", label="pad_mode")
                        pad_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        pad_output_audio = gr.Audio(label="output_audio")
                        pad_log = gr.Text(label="log")

                gr.Examples(
                    examples=pad_audio_examples,
                    inputs=[pad_audio, pad_seconds, pad_mode],
                    outputs=[
                        pad_output_audio, pad_log
                    ],
                    fn=when_click_pad_audio,
                )

                pad_button.click(
                    when_click_pad_audio,
                    inputs=[pad_audio, pad_seconds, pad_mode],
                    outputs=[
                        pad_output_audio, pad_log
                    ],
                )
            with gr.TabItem("reverb"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        reverb_audio = gr.Audio(label="audio")
                        reverb_kwargs = gr.Textbox(lines=8, label="kwargs")
                        reverb_engine = gr.Dropdown(choices=reverb_choices, value=reverb_choices[0], label="engine")
                        reverb_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        reverb_output_audio = gr.Audio(label="output_audio")
                        reverb_log = gr.Text(label="log")

                gr.Examples(
                    examples=reverb_examples,
                    inputs=[reverb_audio, reverb_kwargs, reverb_engine],
                    outputs=[
                        reverb_output_audio, reverb_log
                    ],
                    fn=when_click_reverb,
                )

                reverb_button.click(
                    when_click_reverb,
                    inputs=[reverb_audio, reverb_kwargs, reverb_engine],
                    outputs=[
                        reverb_output_audio, reverb_log
                    ],
                )
            with gr.TabItem("mix"):
                with gr.Row():
                    with gr.Column(variant="panel", scale=5):
                        mix_speed_audio = gr.Audio(label="speech")
                        mix_noise_audio = gr.Audio(label="noise")
                        with gr.Row():
                            mix_snr_db = gr.Slider(minimum=-10, maximum=20, value=10, step=0.1, label="snr_db")
                        mix_button = gr.Button(variant="primary")
                    with gr.Column(variant="panel", scale=5):
                        mix_output_audio = gr.Audio(label="output_audio")
                        mix_log = gr.Text(label="log")

                gr.Examples(
                    examples=mix_speech_and_noise_examples,
                    inputs=[mix_speed_audio, mix_noise_audio, mix_snr_db],
                    outputs=[
                        mix_output_audio, mix_log
                    ],
                    fn=when_click_mix_speech_and_noise,
                )

                mix_button.click(
                    when_click_mix_speech_and_noise,
                    inputs=[mix_speed_audio, mix_noise_audio, mix_snr_db],
                    outputs=[
                        mix_output_audio, mix_log
                    ],
                )

    # http://127.0.0.1:7860/
    blocks.queue().launch(
        # Fixed: `False if platform.system() == "Windows" else False` was a
        # dead conditional — both branches were False.
        share=False,
        # server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
        server_name="0.0.0.0",
        server_port=7860,
    )
    return
# Script entry point: launch the gradio demo when run directly.
if __name__ == "__main__":
    main()