File size: 5,928 Bytes
7ee3434 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import numpy as np
import torch
import torchaudio
def save_feature(process_dir, feature_dir, item, feature, overrides=True):
"""Save features to path
Args:
process_dir (str): directory to store features
feature_dir (_type_): directory to store one type of features (mel, energy, ...)
item (str): uid
feature (tensor): feature tensor
overrides (bool, optional): whether to override existing files. Defaults to True.
"""
process_dir = os.path.join(process_dir, feature_dir)
os.makedirs(process_dir, exist_ok=True)
out_path = os.path.join(process_dir, item + ".npy")
if os.path.exists(out_path):
if overrides:
np.save(out_path, feature)
else:
np.save(out_path, feature)
def save_txt(process_dir, feature_dir, item, feature, overrides=True):
process_dir = os.path.join(process_dir, feature_dir)
os.makedirs(process_dir, exist_ok=True)
out_path = os.path.join(process_dir, item + ".txt")
if os.path.exists(out_path):
if overrides:
f = open(out_path, "w")
f.writelines(feature)
f.close()
else:
f = open(out_path, "w")
f.writelines(feature)
f.close()
def save_audio(path, waveform, fs, add_silence=False, turn_up=False, volume_peak=0.9):
"""Save audio to path with processing (turn up volume, add silence)
Args:
path (str): path to save audio
waveform (numpy array): waveform to save
fs (int): sampling rate
add_silence (bool, optional): whether to add silence to beginning and end. Defaults to False.
turn_up (bool, optional): whether to turn up volume. Defaults to False.
volume_peak (float, optional): volume peak. Defaults to 0.9.
"""
if turn_up:
# continue to turn up to volume_peak
ratio = volume_peak / max(waveform.max(), abs(waveform.min()))
waveform = waveform * ratio
if add_silence:
silence_len = fs // 20
silence = np.zeros((silence_len,), dtype=waveform.dtype)
result = np.concatenate([silence, waveform, silence])
waveform = result
waveform = torch.as_tensor(waveform, dtype=torch.float32, device="cpu")
if len(waveform.size()) == 1:
waveform = waveform[None, :]
elif waveform.size(0) != 1:
# Stereo to mono
waveform = torch.mean(waveform, dim=0, keepdim=True)
torchaudio.save(path, waveform, fs, encoding="PCM_S", bits_per_sample=16)
def save_torch_audio(process_dir, feature_dir, item, wav_torch, fs, overrides=True):
"""Save torch audio to path without processing
Args:
process_dir (str): directory to store features
feature_dir (_type_): directory to store one type of features (mel, energy, ...)
item (str): uid
wav_torch (tensor): feature tensor
fs (int): sampling rate
overrides (bool, optional): whether to override existing files. Defaults to True.
"""
if wav_torch.shape != 2:
wav_torch = wav_torch.unsqueeze(0)
process_dir = os.path.join(process_dir, feature_dir)
os.makedirs(process_dir, exist_ok=True)
out_path = os.path.join(process_dir, item + ".wav")
torchaudio.save(out_path, wav_torch, fs)
async def async_load_audio(path, sample_rate: int = 24000):
r"""
Args:
path: The source loading path.
sample_rate: The target sample rate, will automatically resample if necessary.
Returns:
waveform: The waveform object. Should be [1 x sequence_len].
"""
async def use_torchaudio_load(path):
return torchaudio.load(path)
waveform, sr = await use_torchaudio_load(path)
waveform = torch.mean(waveform, dim=0, keepdim=True)
if sr != sample_rate:
waveform = torchaudio.functional.resample(waveform, sr, sample_rate)
if torch.any(torch.isnan(waveform) or torch.isinf(waveform)):
raise ValueError("NaN or Inf found in waveform.")
return waveform
async def async_save_audio(
path,
waveform,
sample_rate: int = 24000,
add_silence: bool = False,
volume_peak: float = 0.9,
):
r"""
Args:
path: The target saving path.
waveform: The waveform object. Should be [n_channel x sequence_len].
sample_rate: Sample rate.
add_silence: If ``true``, concat 0.05s silence to beginning and end.
volume_peak: Turn up volume for larger number, vice versa.
"""
async def use_torchaudio_save(path, waveform, sample_rate):
torchaudio.save(
path, waveform, sample_rate, encoding="PCM_S", bits_per_sample=16
)
waveform = torch.as_tensor(waveform, device="cpu", dtype=torch.float32)
shape = waveform.size()[:-1]
ratio = abs(volume_peak) / max(waveform.max(), abs(waveform.min()))
waveform = waveform * ratio
if add_silence:
silence_len = sample_rate // 20
silence = torch.zeros((*shape, silence_len), dtype=waveform.type())
waveform = torch.concatenate((silence, waveform, silence), dim=-1)
if waveform.dim() == 1:
waveform = waveform[None]
await use_torchaudio_save(path, waveform, sample_rate)
def load_mel_extrema(cfg, dataset_name, split):
dataset_dir = os.path.join(
cfg.OUTPUT_PATH,
"preprocess/{}_version".format(cfg.data.process_version),
dataset_name,
)
min_file = os.path.join(
dataset_dir,
"mel_min_max",
split.split("_")[-1],
"mel_min.npy",
)
max_file = os.path.join(
dataset_dir,
"mel_min_max",
split.split("_")[-1],
"mel_max.npy",
)
mel_min = np.load(min_file)
mel_max = np.load(max_file)
return mel_min, mel_max
|