Spaces:
Sleeping
Sleeping
File size: 5,278 Bytes
6557f75 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
import math
import torch
import warnings
# https://github.com/pytorch/audio/blob/d9942bae249329bd8c8bf5c92f0f108595fcb84f/torchaudio/functional/functional.py#L495
def _create_triangular_filterbank(
all_freqs: torch.Tensor,
f_pts: torch.Tensor,
) -> torch.Tensor:
"""Create a triangular filter bank.
Args:
all_freqs (Tensor): STFT freq points of size (`n_freqs`).
f_pts (Tensor): Filter mid points of size (`n_filter`).
Returns:
fb (Tensor): The filter bank of size (`n_freqs`, `n_filter`).
"""
# Adopted from Librosa
# calculate the difference between each filter mid point and each stft freq point in hertz
f_diff = f_pts[1:] - f_pts[:-1] # (n_filter + 1)
slopes = f_pts.unsqueeze(0) - all_freqs.unsqueeze(1) # (n_freqs, n_filter + 2)
# create overlapping triangles
zero = torch.zeros(1)
down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] # (n_freqs, n_filter)
up_slopes = slopes[:, 2:] / f_diff[1:] # (n_freqs, n_filter)
fb = torch.max(zero, torch.min(down_slopes, up_slopes))
return fb
# https://github.com/pytorch/audio/blob/d9942bae249329bd8c8bf5c92f0f108595fcb84f/torchaudio/prototype/functional/functional.py#L6
def _hz_to_bark(freqs: float, bark_scale: str = "traunmuller") -> float:
r"""Convert Hz to Barks.
Args:
freqs (float): Frequencies in Hz
bark_scale (str, optional): Scale to use: ``traunmuller``, ``schroeder`` or ``wang``. (Default: ``traunmuller``)
Returns:
barks (float): Frequency in Barks
"""
if bark_scale not in ["schroeder", "traunmuller", "wang"]:
raise ValueError(
'bark_scale should be one of "schroeder", "traunmuller" or "wang".'
)
if bark_scale == "wang":
return 6.0 * math.asinh(freqs / 600.0)
elif bark_scale == "schroeder":
return 7.0 * math.asinh(freqs / 650.0)
# Traunmuller Bark scale
barks = ((26.81 * freqs) / (1960.0 + freqs)) - 0.53
# Bark value correction
if barks < 2:
barks += 0.15 * (2 - barks)
elif barks > 20.1:
barks += 0.22 * (barks - 20.1)
return barks
def _bark_to_hz(barks: torch.Tensor, bark_scale: str = "traunmuller") -> torch.Tensor:
"""Convert bark bin numbers to frequencies.
Args:
barks (torch.Tensor): Bark frequencies
bark_scale (str, optional): Scale to use: ``traunmuller``,``schroeder`` or ``wang``. (Default: ``traunmuller``)
Returns:
freqs (torch.Tensor): Barks converted in Hz
"""
if bark_scale not in ["schroeder", "traunmuller", "wang"]:
raise ValueError(
'bark_scale should be one of "traunmuller", "schroeder" or "wang".'
)
if bark_scale == "wang":
return 600.0 * torch.sinh(barks / 6.0)
elif bark_scale == "schroeder":
return 650.0 * torch.sinh(barks / 7.0)
# Bark value correction
if any(barks < 2):
idx = barks < 2
barks[idx] = (barks[idx] - 0.3) / 0.85
elif any(barks > 20.1):
idx = barks > 20.1
barks[idx] = (barks[idx] + 4.422) / 1.22
# Traunmuller Bark scale
freqs = 1960 * ((barks + 0.53) / (26.28 - barks))
return freqs
def _hz_to_octs(freqs, tuning=0.0, bins_per_octave=12):
a440 = 440.0 * 2.0 ** (tuning / bins_per_octave)
return torch.log2(freqs / (a440 / 16))
def barkscale_fbanks(
n_freqs: int,
f_min: float,
f_max: float,
n_barks: int,
sample_rate: int,
bark_scale: str = "traunmuller",
) -> torch.Tensor:
r"""Create a frequency bin conversion matrix.
.. devices:: CPU
.. properties:: TorchScript
.. image:: https://download.pytorch.org/torchaudio/doc-assets/bark_fbanks.png
:alt: Visualization of generated filter bank
Args:
n_freqs (int): Number of frequencies to highlight/apply
f_min (float): Minimum frequency (Hz)
f_max (float): Maximum frequency (Hz)
n_barks (int): Number of mel filterbanks
sample_rate (int): Sample rate of the audio waveform
bark_scale (str, optional): Scale to use: ``traunmuller``,``schroeder`` or ``wang``. (Default: ``traunmuller``)
Returns:
torch.Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_barks``)
meaning number of frequencies to highlight/apply to x the number of filterbanks.
Each column is a filterbank so that assuming there is a matrix A of
size (..., ``n_freqs``), the applied result would be
``A * barkscale_fbanks(A.size(-1), ...)``.
"""
# freq bins
all_freqs = torch.linspace(0, sample_rate // 2, n_freqs)
# calculate bark freq bins
m_min = _hz_to_bark(f_min, bark_scale=bark_scale)
m_max = _hz_to_bark(f_max, bark_scale=bark_scale)
m_pts = torch.linspace(m_min, m_max, n_barks + 2)
f_pts = _bark_to_hz(m_pts, bark_scale=bark_scale)
# create filterbank
fb = _create_triangular_filterbank(all_freqs, f_pts)
if (fb.max(dim=0).values == 0.0).any():
warnings.warn(
"At least one bark filterbank has all zero values. "
f"The value for `n_barks` ({n_barks}) may be set too high. "
f"Or, the value for `n_freqs` ({n_freqs}) may be set too low."
)
return fb |