File size: 3,165 Bytes
8c92a11 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import math
import librosa
import torch
import numpy as np
from numpy import linalg as LA
from torchmetrics import PearsonCorrCoef
def extract_energy_pearson_coeffcients(
audio_ref,
audio_deg,
n_fft=1024,
hop_length=256,
win_length=1024,
**kwargs,
):
"""Compute Energy Pearson Coefficients between the predicted and the ground truth audio.
audio_ref: path to the ground truth audio.
audio_deg: path to the predicted audio.
fs: sampling rate.
n_fft: fft size.
hop_length: hop length.
win_length: window length.
method: "dtw" will use dtw algorithm to align the length of the ground truth and predicted audio.
"cut" will cut both audios into a same length according to the one with the shorter length.
db_scale: the ground truth and predicted audio will be converted to db_scale if "True".
"""
# Load hyperparameters
kwargs = kwargs["kwargs"]
fs = kwargs["fs"]
method = kwargs["method"]
db_scale = kwargs["db_scale"]
# Initialize method
pearson = PearsonCorrCoef()
# Load audio
if fs != None:
audio_ref, _ = librosa.load(audio_ref, sr=fs)
audio_deg, _ = librosa.load(audio_deg, sr=fs)
else:
audio_ref, fs = librosa.load(audio_ref)
audio_deg, fs = librosa.load(audio_deg)
# STFT
spec_ref = librosa.stft(
y=audio_ref, n_fft=n_fft, hop_length=hop_length, win_length=win_length
)
spec_deg = librosa.stft(
y=audio_deg, n_fft=n_fft, hop_length=hop_length, win_length=win_length
)
# Get magnitudes
mag_ref = np.abs(spec_ref).T
mag_deg = np.abs(spec_deg).T
# Convert spectrogram to energy
energy_ref = LA.norm(mag_ref, axis=1)
energy_deg = LA.norm(mag_deg, axis=1)
# Convert to db_scale
if db_scale:
energy_ref = 20 * np.log10(energy_ref)
energy_deg = 20 * np.log10(energy_deg)
# Audio length alignment
if method == "cut":
length = min(len(energy_ref), len(energy_deg))
energy_ref = energy_ref[:length]
energy_deg = energy_deg[:length]
elif method == "dtw":
_, wp = librosa.sequence.dtw(energy_ref, energy_deg, backtrack=True)
energy_gt_new = []
energy_pred_new = []
for i in range(wp.shape[0]):
gt_index = wp[i][0]
pred_index = wp[i][1]
energy_gt_new.append(energy_ref[gt_index])
energy_pred_new.append(energy_deg[pred_index])
energy_ref = np.array(energy_gt_new)
energy_deg = np.array(energy_pred_new)
assert len(energy_ref) == len(energy_deg)
# Convert to tensor
energy_ref = torch.from_numpy(energy_ref)
energy_deg = torch.from_numpy(energy_deg)
if torch.cuda.is_available():
device = torch.device("cuda")
energy_ref = energy_ref.to(device)
energy_deg = energy_deg.to(device)
pearson = pearson.to(device)
return pearson(energy_ref, energy_deg).detach().cpu().numpy().tolist()
|