File size: 6,198 Bytes
c968fc3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
# Source: https://github.com/microsoft/DNS-Challenge/tree/master/DNSMOS
#
# Copyright (c) 2022 Microsoft
#
# This code is licensed under the Creative Commons Attribution 4.0 International (CC BY 4.0) license.
# The full license text is available at the root of the source repository.
#
# Note: This code has been modified to fit the context of this repository.
# This code is included in an MIT-licensed repository.
# The repository's MIT license does not apply to this code.
import os
import librosa
import numpy as np
import onnxruntime as ort
import pandas as pd
import tqdm
import warnings
# Silence every Python warning for the whole process from import time onwards.
# NOTE(review): this filter is global, not scoped to this module — confirm
# that suppressing warnings for all importers is intended.
warnings.filterwarnings("ignore")
# Sample rate (Hz) that ComputeScore.__call__ loads or resamples audio to.
SAMPLING_RATE = 16000
# Length, in seconds, of each audio window ComputeScore.__call__ feeds to the
# model (multiplied by SAMPLING_RATE to get the window size in samples).
INPUT_LENGTH = 9.01
class ComputeScore:
    """
    Callable wrapper around a DNSMOS ONNX model.

    An instance owns a single ONNX Runtime session (``self.onnx_sess``).
    Calling the instance scores one clip: the model is run on windows of
    ``INPUT_LENGTH`` seconds taken one second apart, and the per-window
    signal (SIG), background (BAK) and overall (OVRL) scores are averaged.
    """

    # Calibration polynomials (coefficients from highest power to constant),
    # stored in (sig, bak, ovr) order. Built once here instead of on every
    # get_polyfit_val() call, which runs once per audio window.
    _POLY_PERSONALIZED = (
        np.poly1d([-0.01019296, 0.02751166, 1.19576786, -0.24348726]),
        np.poly1d([-0.04976499, 0.44276479, -0.1644611, 0.96883132]),
        np.poly1d([-0.00533021, 0.005101, 1.18058466, -0.11236046]),
    )
    _POLY_STANDARD = (
        np.poly1d([-0.08397278, 1.22083953, 0.0052439]),
        np.poly1d([-0.13166888, 1.60915514, -0.39604546]),
        np.poly1d([-0.06766283, 1.11546468, 0.04602535]),
    )

    def __init__(self, primary_model_path, device="cpu") -> None:
        """
        Create the ONNX Runtime session used for scoring.

        Args:
            primary_model_path (str): Path to the primary model.
            device (str): ``"cuda"`` requests the CUDA execution provider;
                any other value runs the model on the CPU execution provider.

        Returns:
            None
        """
        if device == "cuda":
            self.onnx_sess = ort.InferenceSession(
                primary_model_path, providers=["CUDAExecutionProvider"]
            )
            # Report the providers the session actually ended up with.
            print("Using CUDA:", self.onnx_sess.get_providers())
        else:
            # Pin the CPU provider explicitly. Depending on the ONNX Runtime
            # version, omitting `providers` on a build that ships additional
            # execution providers either raises or lets the runtime choose a
            # non-CPU provider, which contradicts a CPU request.
            self.onnx_sess = ort.InferenceSession(
                primary_model_path, providers=["CPUExecutionProvider"]
            )

    def audio_melspec(
        self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True
    ):
        """
        Compute the mel spectrogram of an audio signal.

        Args:
            audio (np.ndarray): Input audio signal.
            n_mels (int): Number of mel bands.
            frame_size (int): Frame size in samples; the FFT size passed to
                librosa is ``frame_size + 1``.
            hop_length (int): Number of samples between successive frames.
            sr (int): Sampling rate.
            to_db (bool): Whether to convert the power spectrogram to
                peak-relative decibels and rescale it.

        Returns:
            np.ndarray: Mel spectrogram, transposed so frames come first.
        """
        # NOTE(review): n_fft is deliberately kept at frame_size + 1; changing
        # it alters the features, so confirm against the consumer of these
        # features before touching it.
        mel_spec = librosa.feature.melspectrogram(
            y=audio, sr=sr, n_fft=frame_size + 1, hop_length=hop_length, n_mels=n_mels
        )
        if to_db:
            # dB relative to the spectrogram peak (peak -> 0 dB), then shifted
            # and scaled so that the peak maps to 1.0.
            mel_spec = (librosa.power_to_db(mel_spec, ref=np.max) + 40) / 40
        return mel_spec.T

    def get_polyfit_val(self, sig, bak, ovr, is_personalized_MOS):
        """
        Apply the calibration polynomials to raw MOS scores.

        Args:
            sig (float): Raw signal MOS score.
            bak (float): Raw background MOS score.
            ovr (float): Raw overall MOS score.
            is_personalized_MOS (bool): Selects the personalized (cubic)
                polynomials when truthy, the standard (quadratic) ones otherwise.

        Returns:
            tuple: ``(sig, bak, ovr)`` after polynomial mapping.
        """
        if is_personalized_MOS:
            p_sig, p_bak, p_ovr = self._POLY_PERSONALIZED
        else:
            p_sig, p_bak, p_ovr = self._POLY_STANDARD
        return p_sig(sig), p_bak(bak), p_ovr(ovr)

    @staticmethod
    def _ensure_non_empty(audio):
        """Raise ValueError if the audio signal contains no samples."""
        if len(audio) == 0:
            raise ValueError("Input audio is empty; cannot compute DNSMOS scores.")

    def __call__(self, audio, sampling_rate, is_personalized_MOS):
        """
        Compute DNSMOS scores for an audio signal.

        Args:
            audio (np.ndarray or str): Input audio signal or path to audio file.
            sampling_rate (int): Sampling rate of ``audio`` when it is an
                array; ignored for file paths, which are loaded directly at
                ``SAMPLING_RATE``.
            is_personalized_MOS (bool): Flag for personalized MOS.

        Returns:
            dict: ``filename`` (always ``"audio_clip"``), ``len_in_sec``
            (duration before any padding), ``sr``, ``num_hops``, the mean raw
            scores ``OVRL_raw``/``SIG_raw``/``BAK_raw`` and the mean
            calibrated scores ``OVRL``/``SIG``/``BAK``.

        Raises:
            ValueError: If the audio contains no samples.
        """
        from_file = isinstance(audio, str)
        if not from_file:
            # Fail fast on an empty in-memory signal, before any resampling.
            self._ensure_non_empty(audio)

        fs = SAMPLING_RATE
        if from_file:
            audio, _ = librosa.load(audio, sr=fs)
        elif sampling_rate != fs:
            # resample audio
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=fs)
        # Also covers files that decode to zero samples. Without this guard
        # the padding loop below would never terminate, because doubling an
        # empty array leaves it empty.
        self._ensure_non_empty(audio)

        actual_audio_len = len(audio)
        len_samples = int(INPUT_LENGTH * fs)
        # Repeat short clips until they fill at least one model window.
        while len(audio) < len_samples:
            audio = np.append(audio, audio)

        # Windows start one second apart (hop of `fs` samples).
        num_hops = int(np.floor(len(audio) / fs) - INPUT_LENGTH) + 1
        hop_len_samples = fs

        predicted_mos_sig_seg_raw = []
        predicted_mos_bak_seg_raw = []
        predicted_mos_ovr_seg_raw = []
        predicted_mos_sig_seg = []
        predicted_mos_bak_seg = []
        predicted_mos_ovr_seg = []

        for idx in range(num_hops):
            audio_seg = audio[
                int(idx * hop_len_samples) : int((idx + INPUT_LENGTH) * hop_len_samples)
            ]
            # Skip windows shorter than the model's input length.
            if len(audio_seg) < len_samples:
                continue

            # Add a leading batch axis: (samples,) -> (1, samples).
            input_features = np.array(audio_seg).astype("float32")[np.newaxis, :]
            oi = {"input_1": input_features}
            mos_sig_raw, mos_bak_raw, mos_ovr_raw = self.onnx_sess.run(None, oi)[0][0]
            mos_sig, mos_bak, mos_ovr = self.get_polyfit_val(
                mos_sig_raw, mos_bak_raw, mos_ovr_raw, is_personalized_MOS
            )

            predicted_mos_sig_seg_raw.append(mos_sig_raw)
            predicted_mos_bak_seg_raw.append(mos_bak_raw)
            predicted_mos_ovr_seg_raw.append(mos_ovr_raw)
            predicted_mos_sig_seg.append(mos_sig)
            predicted_mos_bak_seg.append(mos_bak)
            predicted_mos_ovr_seg.append(mos_ovr)

        clip_dict = {
            "filename": "audio_clip",
            "len_in_sec": actual_audio_len / fs,
            "sr": fs,
            "num_hops": num_hops,
            "OVRL_raw": np.mean(predicted_mos_ovr_seg_raw),
            "SIG_raw": np.mean(predicted_mos_sig_seg_raw),
            "BAK_raw": np.mean(predicted_mos_bak_seg_raw),
            "OVRL": np.mean(predicted_mos_ovr_seg),
            "SIG": np.mean(predicted_mos_sig_seg),
            "BAK": np.mean(predicted_mos_bak_seg),
        }
        return clip_dict
|