nguyenvulebinh
/

speaker-embedding

Model card Files Files and versions

xet

Community

nguyenvulebinh commited on Oct 18, 2023

Commit

fd06d88

1 Parent(s): 847e2fc

upload infer utils

Browse files

Files changed (2) hide show

wav2filterbank.py +313 -0
xvector_sincnet.py +223 -0

wav2filterbank.py ADDED Viewed

	@@ -0,0 +1,313 @@

+import librosa
+import numpy as np
+import torch
+import torch.nn as nn
+import logging
+import math
+import random
+CONSTANT = 1e-5
+def normalize_batch(x, seq_len, normalize_type):
+    x_mean = None
+    x_std = None
+    if normalize_type == "per_feature":
+        x_mean = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, device=x.device)
+        x_std = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, device=x.device)
+        for i in range(x.shape[0]):
+            if x[i, :, : seq_len[i]].shape[1] == 1:
+                raise ValueError(
+                    "normalize_batch with `per_feature` normalize_type received a tensor of length 1. This will result "
+                    "in torch.std() returning nan. Make sure your audio length has enough samples for a single "
+                    "feature (ex. at least `hop_length` for Mel Spectrograms)."
+                )
+            x_mean[i, :] = x[i, :, : seq_len[i]].mean(dim=1)
+            x_std[i, :] = x[i, :, : seq_len[i]].std(dim=1)
+        # make sure x_std is not zero
+        x_std += CONSTANT
+        return (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2), x_mean, x_std
+    elif normalize_type == "all_features":
+        x_mean = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device)
+        x_std = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device)
+        for i in range(x.shape[0]):
+            x_mean[i] = x[i, :, : seq_len[i].item()].mean()
+            x_std[i] = x[i, :, : seq_len[i].item()].std()
+        # make sure x_std is not zero
+        x_std += CONSTANT
+        return (x - x_mean.view(-1, 1, 1)) / x_std.view(-1, 1, 1), x_mean, x_std
+    elif "fixed_mean" in normalize_type and "fixed_std" in normalize_type:
+        x_mean = torch.tensor(normalize_type["fixed_mean"], device=x.device)
+        x_std = torch.tensor(normalize_type["fixed_std"], device=x.device)
+        return (
+            (x - x_mean.view(x.shape[0], x.shape[1]).unsqueeze(2)) / x_std.view(x.shape[0], x.shape[1]).unsqueeze(2),
+            x_mean,
+            x_std,
+        )
+    else:
+        return x, x_mean, x_std
+def splice_frames(x, frame_splicing):
+    """ Stacks frames together across feature dim
+    input is batch_size, feature_dim, num_frames
+    output is batch_size, feature_dim*frame_splicing, num_frames
+    """
+    seq = [x]
+    for n in range(1, frame_splicing):
+        seq.append(torch.cat([x[:, :, :n], x[:, :, n:]], dim=2))
+    return torch.cat(seq, dim=1)
+class FilterbankFeatures(nn.Module):
+    """Featurizer that converts wavs to Mel Spectrograms.
+    See AudioToMelSpectrogramPreprocessor for args.
+    "normalize": "per_feature",
+    "window_size": 0.025,
+    "sample_rate": 16000,
+    "window_stride": 0.01,
+    "window": "hann",
+    "features": 80,
+    "n_fft": 512,
+    "frame_splicing": 1,
+    "dither": 1e-05
+    n_window_size=window_size * sample_rate,
+    n_window_stride = window_stride * sample_rate,
+    """
+    def __init__(
+        self,
+        sample_rate=16000,
+        n_window_size=400,
+        n_window_stride=160,
+        window="hann",
+        normalize="per_feature",
+        n_fft=512,
+        preemph=0.97,
+        nfilt=80,
+        lowfreq=0,
+        highfreq=None,
+        log=True,
+        log_zero_guard_type="add",
+        log_zero_guard_value=2 ** -24,
+        dither=CONSTANT,
+        pad_to=16,
+        max_duration=16.7,
+        frame_splicing=1,
+        exact_pad=False,
+        pad_value=0,
+        mag_power=2.0,
+        use_grads=False,
+        rng=None,
+        nb_augmentation_prob=0.0,
+        nb_max_freq=4000,
+        stft_exact_pad=False,  # Deprecated arguments; kept for config compatibility
+        stft_conv=False,  # Deprecated arguments; kept for config compatibility
+    ):
+        super().__init__()
+        if stft_conv or stft_exact_pad:
+            logging.warning(
+                "Using torch_stft is deprecated and has been removed. The values have been forcibly set to False "
+                "for FilterbankFeatures and AudioToMelSpectrogramPreprocessor. Please set exact_pad to True "
+                "as needed."
+            )
+        if exact_pad and n_window_stride % 2 == 1:
+            raise NotImplementedError(
+                f"{self} received exact_pad == True, but hop_size was odd. If audio_length % hop_size == 0. Then the "
+                "returned spectrogram would not be of length audio_length // hop_size. Please use an even hop_size."
+            )
+        self.log_zero_guard_value = log_zero_guard_value
+        if (
+            n_window_size is None
+            or n_window_stride is None
+            or not isinstance(n_window_size, int)
+            or not isinstance(n_window_stride, int)
+            or n_window_size <= 0
+            or n_window_stride <= 0
+        ):
+            raise ValueError(
+                f"{self} got an invalid value for either n_window_size or "
+                f"n_window_stride. Both must be positive ints."
+            )
+        logging.info(f"PADDING: {pad_to}")
+        self.win_length = n_window_size
+        self.hop_length = n_window_stride
+        self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length))
+        self.stft_pad_amount = (self.n_fft - self.hop_length) // 2 if exact_pad else None
+        if exact_pad:
+            logging.info("STFT using exact pad")
+        torch_windows = {
+            'hann': torch.hann_window,
+            'hamming': torch.hamming_window,
+            'blackman': torch.blackman_window,
+            'bartlett': torch.bartlett_window,
+            'none': None,
+        }
+        window_fn = torch_windows.get(window, None)
+        window_tensor = window_fn(self.win_length, periodic=False) if window_fn else None
+        self.register_buffer("window", window_tensor)
+        self.stft = lambda x: torch.stft(
+            x,
+            n_fft=self.n_fft,
+            hop_length=self.hop_length,
+            win_length=self.win_length,
+            center=False if exact_pad else True,
+            window=self.window.to(dtype=torch.float),
+            return_complex=True,
+        )
+        self.normalize = normalize
+        self.log = log
+        self.dither = dither
+        self.frame_splicing = frame_splicing
+        self.nfilt = nfilt
+        self.preemph = preemph
+        self.pad_to = pad_to
+        highfreq = highfreq or sample_rate / 2
+        filterbanks = torch.tensor(
+            librosa.filters.mel(sr=sample_rate, n_fft=self.n_fft, n_mels=nfilt, fmin=lowfreq, fmax=highfreq),
+            dtype=torch.float,
+        ).unsqueeze(0)
+        self.register_buffer("fb", filterbanks)
+        # Calculate maximum sequence length
+        max_length = self.get_seq_len(torch.tensor(max_duration * sample_rate, dtype=torch.float))
+        max_pad = pad_to - (max_length % pad_to) if pad_to > 0 else 0
+        self.max_length = max_length + max_pad
+        self.pad_value = pad_value
+        self.mag_power = mag_power
+        # We want to avoid taking the log of zero
+        # There are two options: either adding or clamping to a small value
+        if log_zero_guard_type not in ["add", "clamp"]:
+            raise ValueError(
+                f"{self} received {log_zero_guard_type} for the "
+                f"log_zero_guard_type parameter. It must be either 'add' or "
+                f"'clamp'."
+            )
+        self.use_grads = use_grads
+        if not use_grads:
+            self.forward = torch.no_grad()(self.forward)
+        self._rng = random.Random() if rng is None else rng
+        self.nb_augmentation_prob = nb_augmentation_prob
+        if self.nb_augmentation_prob > 0.0:
+            if nb_max_freq >= sample_rate / 2:
+                self.nb_augmentation_prob = 0.0
+            else:
+                self._nb_max_fft_bin = int((nb_max_freq / sample_rate) * n_fft)
+        # log_zero_guard_value is the the small we want to use, we support
+        # an actual number, or "tiny", or "eps"
+        self.log_zero_guard_type = log_zero_guard_type
+        logging.debug(f"sr: {sample_rate}")
+        logging.debug(f"n_fft: {self.n_fft}")
+        logging.debug(f"win_length: {self.win_length}")
+        logging.debug(f"hop_length: {self.hop_length}")
+        logging.debug(f"n_mels: {nfilt}")
+        logging.debug(f"fmin: {lowfreq}")
+        logging.debug(f"fmax: {highfreq}")
+        logging.debug(f"using grads: {use_grads}")
+        logging.debug(f"nb_augmentation_prob: {nb_augmentation_prob}")
+    def log_zero_guard_value_fn(self, x):
+        if isinstance(self.log_zero_guard_value, str):
+            if self.log_zero_guard_value == "tiny":
+                return torch.finfo(x.dtype).tiny
+            elif self.log_zero_guard_value == "eps":
+                return torch.finfo(x.dtype).eps
+            else:
+                raise ValueError(
+                    f"{self} received {self.log_zero_guard_value} for the "
+                    f"log_zero_guard_type parameter. It must be either a "
+                    f"number, 'tiny', or 'eps'"
+                )
+        else:
+            return self.log_zero_guard_value
+    def get_seq_len(self, seq_len):
+        # Assuming that center is True is stft_pad_amount = 0
+        pad_amount = self.stft_pad_amount * 2 if self.stft_pad_amount is not None else self.n_fft // 2 * 2
+        seq_len = torch.floor((seq_len + pad_amount - self.n_fft) / self.hop_length) + 1
+        return seq_len.to(dtype=torch.long)
+    @property
+    def filter_banks(self):
+        return self.fb
+    def forward(self, x, seq_len, linear_spec=False):
+        seq_len = self.get_seq_len(seq_len.float())
+        if self.stft_pad_amount is not None:
+            x = torch.nn.functional.pad(
+                x.unsqueeze(1), (self.stft_pad_amount, self.stft_pad_amount), "reflect"
+            ).squeeze(1)
+        # dither (only in training mode for eval determinism)
+        if self.training and self.dither > 0:
+            x += self.dither * torch.randn_like(x)
+        # do preemphasis
+        if self.preemph is not None:
+            x = torch.cat((x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), dim=1)
+        # disable autocast to get full range of stft values
+        with torch.cuda.amp.autocast(enabled=False):
+            x = self.stft(x)
+        # torch stft returns complex tensor (of shape [B,N,T]); so convert to magnitude
+        # guard is needed for sqrt if grads are passed through
+        guard = 0 if not self.use_grads else CONSTANT
+        x = torch.view_as_real(x)
+        x = torch.sqrt(x.pow(2).sum(-1) + guard)
+        if self.training and self.nb_augmentation_prob > 0.0:
+            for idx in range(x.shape[0]):
+                if self._rng.random() < self.nb_augmentation_prob:
+                    x[idx, self._nb_max_fft_bin :, :] = 0.0
+        # get power spectrum
+        if self.mag_power != 1.0:
+            x = x.pow(self.mag_power)
+        # return plain spectrogram if required
+        if linear_spec:
+            return x, seq_len
+        # dot with filterbank energies
+        x = torch.matmul(self.fb.to(x.dtype), x)
+        # log features if required
+        if self.log:
+            if self.log_zero_guard_type == "add":
+                x = torch.log(x + self.log_zero_guard_value_fn(x))
+            elif self.log_zero_guard_type == "clamp":
+                x = torch.log(torch.clamp(x, min=self.log_zero_guard_value_fn(x)))
+            else:
+                raise ValueError("log_zero_guard_type was not understood")
+        # frame splicing if required
+        if self.frame_splicing > 1:
+            x = splice_frames(x, self.frame_splicing)
+        # normalize if required
+        if self.normalize:
+            x, _, _ = normalize_batch(x, seq_len, normalize_type=self.normalize)
+        # mask to zero any values beyond seq_len in batch, pad to multiple of `pad_to` (for efficiency)
+        max_len = x.size(-1)
+        mask = torch.arange(max_len).to(x.device)
+        mask = mask.repeat(x.size(0), 1) >= seq_len.unsqueeze(1)
+        x = x.masked_fill(mask.unsqueeze(1).type(torch.bool).to(device=x.device), self.pad_value)
+        del mask
+        pad_to = self.pad_to
+        if pad_to == "max":
+            x = nn.functional.pad(x, (0, self.max_length - x.size(-1)), value=self.pad_value)
+        elif pad_to > 0:
+            pad_amt = x.size(-1) % pad_to
+            if pad_amt != 0:
+                x = nn.functional.pad(x, (0, pad_to - pad_amt), value=self.pad_value)
+        return x, seq_len

xvector_sincnet.py ADDED Viewed

	@@ -0,0 +1,223 @@

+from typing import Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import warnings
+from asteroid_filterbanks import Encoder, ParamSincFB
+def merge_dict(defaults: dict, custom: dict = None):
+    params = dict(defaults)
+    if custom is not None:
+        params.update(custom)
+    return params
+class StatsPool(nn.Module):
+    """Statistics pooling
+    Compute temporal mean and (unbiased) standard deviation
+    and returns their concatenation.
+    Reference
+    ---------
+    https://en.wikipedia.org/wiki/Weighted_arithmetic_mean
+    """
+    def forward(
+        self, sequences: torch.Tensor, weights: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        """Forward pass
+        Parameters
+        ----------
+        sequences : (batch, channel, frames) torch.Tensor
+            Sequences.
+        weights : (batch, frames) torch.Tensor, optional
+            When provided, compute weighted mean and standard deviation.
+        Returns
+        -------
+        output : (batch, 2 * channel) torch.Tensor
+            Concatenation of mean and (unbiased) standard deviation.
+        """
+        if weights is None:
+            mean = sequences.mean(dim=2)
+            std = sequences.std(dim=2, unbiased=True)
+        else:
+            weights = weights.unsqueeze(dim=1)
+            # (batch, 1, frames)
+            num_frames = sequences.shape[2]
+            num_weights = weights.shape[2]
+            if num_frames != num_weights:
+                warnings.warn(
+                    f"Mismatch between frames ({num_frames}) and weights ({num_weights}) numbers."
+                )
+                weights = F.interpolate(
+                    weights, size=num_frames, mode="linear", align_corners=False
+                )
+            v1 = weights.sum(dim=2)
+            mean = torch.sum(sequences * weights, dim=2) / v1
+            dx2 = torch.square(sequences - mean.unsqueeze(2))
+            v2 = torch.square(weights).sum(dim=2)
+            var = torch.sum(dx2 * weights, dim=2) / (v1 - v2 / v1)
+            std = torch.sqrt(var)
+        return torch.cat([mean, std], dim=1)
+class SincNet(nn.Module):
+    def __init__(self, sample_rate: int = 16000, stride: int = 1):
+        super().__init__()
+        if sample_rate != 16000:
+            raise NotImplementedError("PyanNet only supports 16kHz audio for now.")
+            # TODO: add support for other sample rate. it should be enough to multiply
+            # kernel_size by (sample_rate / 16000). but this needs to be double-checked.
+        self.stride = stride
+        self.wav_norm1d = nn.InstanceNorm1d(1, affine=True)
+        self.conv1d = nn.ModuleList()
+        self.pool1d = nn.ModuleList()
+        self.norm1d = nn.ModuleList()
+        self.conv1d.append(
+            Encoder(
+                ParamSincFB(
+                    80,
+                    251,
+                    stride=self.stride,
+                    sample_rate=sample_rate,
+                    min_low_hz=50,
+                    min_band_hz=50,
+                )
+            )
+        )
+        self.pool1d.append(nn.MaxPool1d(3, stride=3, padding=0, dilation=1))
+        self.norm1d.append(nn.InstanceNorm1d(80, affine=True))
+        self.conv1d.append(nn.Conv1d(80, 60, 5, stride=1))
+        self.pool1d.append(nn.MaxPool1d(3, stride=3, padding=0, dilation=1))
+        self.norm1d.append(nn.InstanceNorm1d(60, affine=True))
+        self.conv1d.append(nn.Conv1d(60, 60, 5, stride=1))
+        self.pool1d.append(nn.MaxPool1d(3, stride=3, padding=0, dilation=1))
+        self.norm1d.append(nn.InstanceNorm1d(60, affine=True))
+    def forward(self, waveforms: torch.Tensor) -> torch.Tensor:
+        """Pass forward
+        Parameters
+        ----------
+        waveforms : (batch, channel, sample)
+        """
+        outputs = self.wav_norm1d(waveforms)
+        for c, (conv1d, pool1d, norm1d) in enumerate(
+            zip(self.conv1d, self.pool1d, self.norm1d)
+        ):
+            outputs = conv1d(outputs)
+            # https://github.com/mravanelli/SincNet/issues/4
+            if c == 0:
+                outputs = torch.abs(outputs)
+            outputs = F.leaky_relu(norm1d(pool1d(outputs)))
+        return outputs
+class XVectorSincNet(nn.Module):
+    SINCNET_DEFAULTS = {"stride": 10}
+    def __init__(
+        self,
+        sample_rate: int = 16000,
+        # num_channels: int = 1,
+        sincnet: dict = dict(
+            stride=10,
+            sample_rate=16000
+        ),
+        dimension: int = 512,
+        # task: Optional[Task] = None,
+    ):
+        super(XVectorSincNet, self).__init__()
+        sincnet = merge_dict(self.SINCNET_DEFAULTS, sincnet)
+        sincnet["sample_rate"] = sample_rate
+        # self.save_hyperparameters("sincnet", "dimension")
+        self.sincnet = SincNet(**sincnet)
+        in_channel = 60
+        self.tdnns = nn.ModuleList()
+        out_channels = [512, 512, 512, 512, 1500]
+        kernel_sizes = [5, 3, 3, 1, 1]
+        dilations = [1, 2, 3, 1, 1]
+        for out_channel, kernel_size, dilation in zip(
+            out_channels, kernel_sizes, dilations
+        ):
+            self.tdnns.extend(
+                [
+                    nn.Conv1d(
+                        in_channels=in_channel,
+                        out_channels=out_channel,
+                        kernel_size=kernel_size,
+                        dilation=dilation,
+                    ),
+                    nn.LeakyReLU(),
+                    nn.BatchNorm1d(out_channel),
+                ]
+            )
+            in_channel = out_channel
+        self.stats_pool = StatsPool()
+        self.embedding = nn.Linear(in_channel * 2, dimension)
+    def forward(
+        self, waveforms: torch.Tensor, weights: torch.Tensor = None
+    ) -> torch.Tensor:
+        """
+        Parameters
+        ----------
+        waveforms : torch.Tensor
+            Batch of waveforms with shape (batch, channel, sample)
+        weights : torch.Tensor, optional
+            Batch of weights with shape (batch, frame).
+        """
+        outputs = self.sincnet(waveforms).squeeze(dim=1)
+        for tdnn in self.tdnns:
+            outputs = tdnn(outputs)
+        outputs = self.stats_pool(outputs, weights=weights)
+        return self.embedding(outputs)
+""" Load model
+def cal_xvector_sincnet_embedding(xvector_model, ref_wav, max_length=5, sr=16000):
+    wavs = []
+    for i in range(0, len(ref_wav), max_length*sr):
+        wav = ref_wav[i:i + max_length*sr]
+        wav = np.concatenate([wav, np.zeros(max(0, max_length * sr - len(wav)))])
+        wavs.append(wav)
+    wavs = torch.from_numpy(np.stack(wavs))
+    if use_gpu:
+        wavs = wavs.cuda()
+    embed = xvector_model(wavs.unsqueeze(1).float())
+    return torch.mean(embed, dim=0).detach().cpu()
+xvector_model = XVectorSincNet()
+model_file = "model-bin/speaker_embedding/xvector_sincnet.pt"
+meta = torch.load(model_file, map_location='cpu')['state_dict']
+print('load_xvector_sincnet_model', xvector_model.load_state_dict(meta, strict=False))
+xvector_model = xvector_model.eval()
+for param in xvector_model.parameters():
+    param.requires_grad = False
+"""