Upload 9 files
- resemblyzer/__init__.py +6 -0
- resemblyzer/__pycache__/__init__.cpython-38.pyc +0 -0
- resemblyzer/__pycache__/audio.cpython-38.pyc +0 -0
- resemblyzer/__pycache__/hparams.cpython-38.pyc +0 -0
- resemblyzer/__pycache__/voice_encoder.cpython-38.pyc +0 -0
- resemblyzer/audio.py +108 -0
- resemblyzer/hparams.py +33 -0
- resemblyzer/pretrained.pt +3 -0
- resemblyzer/voice_encoder.py +177 -0
resemblyzer/__init__.py
ADDED
@@ -0,0 +1,6 @@
+name = "resemblyzer"
+
+from resemblyzer.audio import preprocess_wav, wav_to_mel_spectrogram, trim_long_silences, \
+    normalize_volume
+from resemblyzer.hparams import sampling_rate
+from resemblyzer.voice_encoder import VoiceEncoder
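These six lines are the package's entire public surface. A minimal import sketch, assuming the uploaded resemblyzer/ folder is on the Python path (nothing here beyond what __init__.py re-exports):

from resemblyzer import (preprocess_wav, wav_to_mel_spectrogram, trim_long_silences,
                         normalize_volume, sampling_rate, VoiceEncoder)

print(sampling_rate)  # 16000, defined in resemblyzer/hparams.py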
resemblyzer/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (444 Bytes).
resemblyzer/__pycache__/audio.cpython-38.pyc
ADDED
Binary file (3.8 kB).
resemblyzer/__pycache__/hparams.cpython-38.pyc
ADDED
Binary file (514 Bytes).
resemblyzer/__pycache__/voice_encoder.cpython-38.pyc
ADDED
Binary file (8.47 kB).
resemblyzer/audio.py
ADDED
@@ -0,0 +1,108 @@
+from scipy.ndimage import binary_dilation
+from resemblyzer.hparams import *
+from pathlib import Path
+from typing import Optional, Union
+import numpy as np
+import webrtcvad
+import librosa
+import struct
+
+int16_max = (2 ** 15) - 1
+
+
+def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], source_sr: Optional[int]=None):
+    """
+    Applies preprocessing operations to a waveform either on disk or in memory so that it is
+    ready for the voice encoder. The waveform will be resampled to match the data
+    hyperparameters.
+
+    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
+    just .wav), or the waveform as a numpy array of floats.
+    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
+    preprocessing. After preprocessing, the waveform's sampling rate will match the data
+    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
+    this argument will be ignored.
+    """
+    # Load the wav from disk if needed
+    if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
+        wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
+    else:
+        wav = fpath_or_wav
+
+    # Resample the wav
+    if source_sr is not None:
+        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)
+
+    # Apply the preprocessing: normalize volume and shorten long silences
+    wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
+    wav = trim_long_silences(wav)
+
+    return wav
+
+
+def wav_to_mel_spectrogram(wav):
+    """
+    Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
+    Note: this is not a log-mel spectrogram.
+    """
+    frames = librosa.feature.melspectrogram(
+        y=wav,
+        sr=sampling_rate,
+        n_fft=int(sampling_rate * mel_window_length / 1000),
+        hop_length=int(sampling_rate * mel_window_step / 1000),
+        n_mels=mel_n_channels
+    )
+    return frames.astype(np.float32).T
+
+
+def trim_long_silences(wav):
+    """
+    Ensures that segments without voice in the waveform remain no longer than a
+    threshold determined by the VAD parameters in hparams.py.
+
+    :param wav: the raw waveform as a numpy array of floats
+    :return: the same waveform with silences trimmed away (length <= original wav length)
+    """
+    # Compute the voice detection window size
+    samples_per_window = (vad_window_length * sampling_rate) // 1000
+
+    # Trim the end of the audio to have a multiple of the window size
+    wav = wav[:len(wav) - (len(wav) % samples_per_window)]
+
+    # Convert the float waveform to 16-bit mono PCM
+    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
+
+    # Perform voice activation detection
+    voice_flags = []
+    vad = webrtcvad.Vad(mode=3)
+    for window_start in range(0, len(wav), samples_per_window):
+        window_end = window_start + samples_per_window
+        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
+                                         sample_rate=sampling_rate))
+    voice_flags = np.array(voice_flags)
+
+    # Smooth the voice detection with a moving average
+    def moving_average(array, width):
+        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
+        ret = np.cumsum(array_padded, dtype=float)
+        ret[width:] = ret[width:] - ret[:-width]
+        return ret[width - 1:] / width
+
+    audio_mask = moving_average(voice_flags, vad_moving_average_width)
+    audio_mask = np.round(audio_mask).astype(np.bool_)
+
+    # Dilate the voiced regions
+    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
+    audio_mask = np.repeat(audio_mask, samples_per_window)
+
+    return wav[audio_mask == True]
+
+
+def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
+    if increase_only and decrease_only:
+        raise ValueError("Both increase only and decrease only are set")
+    rms = np.sqrt(np.mean((wav * int16_max) ** 2))
+    wave_dBFS = 20 * np.log10(rms / int16_max)
+    dBFS_change = target_dBFS - wave_dBFS
+    if dBFS_change < 0 and increase_only or dBFS_change > 0 and decrease_only:
+        return wav
+    return wav * (10 ** (dBFS_change / 20))
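A short, hedged sketch of how these helpers compose (the file path below is a placeholder, not part of the commit): preprocess_wav accepts either a path or an in-memory float waveform with its source sampling rate, and wav_to_mel_spectrogram turns the resulting 16 kHz signal into the (n_frames, 40) mel frames that the encoder consumes.

from resemblyzer.audio import preprocess_wav, wav_to_mel_spectrogram

# From a file (placeholder path): librosa detects the sampling rate, the wav is resampled
# to 16 kHz, volume-normalized towards -30 dBFS and long silences are removed by the VAD.
wav = preprocess_wav("example_speech.flac")

# From memory instead: pass the waveform's own sampling rate so the resampling step can run,
# e.g. preprocess_wav(raw_44k, source_sr=44100) for a float32 recording at 44.1 kHz.

# The encoder consumes mel frames of shape (n_frames, mel_n_channels) == (n_frames, 40).
mel = wav_to_mel_spectrogram(wav)
print(mel.shape)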
resemblyzer/hparams.py
ADDED
@@ -0,0 +1,33 @@
+
+## Mel-filterbank
+mel_window_length = 25  # In milliseconds
+mel_window_step = 10    # In milliseconds
+mel_n_channels = 40
+
+
+## Audio
+sampling_rate = 16000
+# Number of spectrogram frames in a partial utterance
+partials_n_frames = 160  # 1600 ms
+
+
+## Voice Activation Detection
+# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
+# This sets the granularity of the VAD. Should not need to be changed.
+vad_window_length = 30  # In milliseconds
+# Number of frames to average together when performing the moving average smoothing.
+# The larger this value, the larger the VAD variations must be to not get smoothed out.
+vad_moving_average_width = 8
+# Maximum number of consecutive silent frames a segment can have.
+vad_max_silence_length = 6
+
+
+## Audio volume normalization
+audio_norm_target_dBFS = -30
+
+
+## Model parameters
+model_hidden_size = 256
+model_embedding_size = 256
+model_num_layers = 3
+
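These hyperparameters fix the time resolution used by the other files in this upload. A small sketch of the derived quantities (plain arithmetic, nothing beyond what the comments above already state):

from resemblyzer.hparams import *

# Mel analysis: 25 ms windows every 10 ms at 16 kHz.
n_fft = int(sampling_rate * mel_window_length / 1000)      # 400 samples
hop_length = int(sampling_rate * mel_window_step / 1000)   # 160 samples

# One partial utterance spans 160 mel frames, i.e. 160 * 10 ms = 1.6 s of audio.
partial_seconds = partials_n_frames * mel_window_step / 1000   # 1.6

# Each VAD decision covers 30 ms, i.e. 480 samples of 16-bit PCM.
samples_per_vad_window = (vad_window_length * sampling_rate) // 1000   # 480

print(n_fft, hop_length, partial_seconds, samples_per_vad_window)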
resemblyzer/pretrained.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39373b86598fa3da9fcddee6142382efe09777e8d37dc9c0561f41f0070f134e
+size 17090379
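Note that this file is a Git LFS pointer, not the checkpoint itself: the repository needs the LFS object checked out, otherwise VoiceEncoder would try to torch.load a three-line text file and fail. A small standard-library sketch to check that the file on disk is the real ~17 MB checkpoint whose SHA-256 matches the oid above:

import hashlib
from pathlib import Path

expected_oid = "39373b86598fa3da9fcddee6142382efe09777e8d37dc9c0561f41f0070f134e"
fpath = Path("resemblyzer/pretrained.pt")

digest = hashlib.sha256(fpath.read_bytes()).hexdigest()
if fpath.stat().st_size < 1000 or digest != expected_oid:
    raise RuntimeError("pretrained.pt looks like an unfetched LFS pointer; run `git lfs pull`.")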
resemblyzer/voice_encoder.py
ADDED
@@ -0,0 +1,177 @@
+from resemblyzer.hparams import *
+from resemblyzer import audio
+from pathlib import Path
+from typing import Union, List
+from torch import nn
+from time import perf_counter as timer
+import numpy as np
+import torch
+
+
+class VoiceEncoder(nn.Module):
+    def __init__(self, device: Union[str, torch.device]=None, verbose=True, weights_fpath: Union[Path, str]=None):
+        """
+        :param device: either a torch device or the name of one (e.g. "cpu", "cuda").
+        If None, defaults to cuda if it is available on your machine, otherwise the model will
+        run on cpu. Outputs are always returned on the cpu, as numpy arrays.
+        :param weights_fpath: path to a "<CUSTOM_MODEL>.pt" weights file.
+        If None, defaults to the built-in "pretrained.pt" model.
+        """
+        super().__init__()
+
+        # Define the network
+        self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
+        self.linear = nn.Linear(model_hidden_size, model_embedding_size)
+        self.relu = nn.ReLU()
+
+        # Get the target device
+        if device is None:
+            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        self.device = device
+
+        # Load the pretrained model's weights
+        if weights_fpath is None:
+            weights_fpath = Path(__file__).resolve().parent.joinpath("pretrained.pt")
+        else:
+            weights_fpath = Path(weights_fpath)
+
+        if not weights_fpath.exists():
+            raise Exception("Couldn't find the voice encoder pretrained model at %s." %
+                            weights_fpath)
+        start = timer()
+        checkpoint = torch.load(weights_fpath, map_location="cpu")
+        self.load_state_dict(checkpoint["model_state"], strict=False)
+        self.to(device)
+
+        if verbose:
+            print("Loaded the voice encoder model on %s in %.2f seconds." %
+                  (device.type, timer() - start))
+
+    def forward(self, mels: torch.FloatTensor):
+        """
+        Computes the embeddings of a batch of utterance spectrograms.
+
+        :param mels: a batch of mel spectrograms of same duration as a float32 tensor of shape
+        (batch_size, n_frames, n_channels)
+        :return: the embeddings as a float32 tensor of shape (batch_size, embedding_size).
+        Embeddings are positive and L2-normed, thus they lie in the range [0, 1].
+        """
+        # Pass the input through the LSTM layers and retrieve the final hidden state of the last
+        # layer. Apply a cutoff to 0 for negative values and L2 normalize the embeddings.
+        _, (hidden, _) = self.lstm(mels)
+        embeds_raw = self.relu(self.linear(hidden[-1]))
+        return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
+
+    @staticmethod
+    def compute_partial_slices(n_samples: int, rate, min_coverage):
+        """
+        Computes where to split an utterance waveform and its corresponding mel spectrogram to
+        obtain partial utterances of <partials_n_frames> each. Both the waveform and the
+        mel spectrogram slices are returned, so as to make each partial utterance waveform
+        correspond to its spectrogram.
+
+        The returned ranges may index further than the length of the waveform. It is
+        recommended that you pad the waveform with zeros up to wav_slices[-1].stop.
+
+        :param n_samples: the number of samples in the waveform
+        :param rate: how many partial utterances should occur per second. Partial utterances must
+        cover the span of the entire utterance, thus the rate should not be lower than the inverse
+        of the duration of a partial utterance. By default, partial utterances are 1.6s long and
+        the minimum rate is thus 0.625.
+        :param min_coverage: when reaching the last partial utterance, it may or may not have
+        enough frames. If at least <min_coverage> of <partials_n_frames> are present,
+        then the last partial utterance will be considered by zero-padding the audio. Otherwise,
+        it will be discarded. If there aren't enough frames for one partial utterance,
+        this parameter is ignored so that the function always returns at least one slice.
+        :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
+        respectively the waveform and the mel spectrogram with these slices to obtain the partial
+        utterances.
+        """
+        assert 0 < min_coverage <= 1
+
+        # Compute how many frames separate two partial utterances
+        samples_per_frame = int((sampling_rate * mel_window_step / 1000))
+        n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
+        frame_step = int(np.round((sampling_rate / rate) / samples_per_frame))
+        assert 0 < frame_step, "The rate is too high"
+        assert frame_step <= partials_n_frames, "The rate is too low, it should be %f at least" % \
+            (sampling_rate / (samples_per_frame * partials_n_frames))
+
+        # Compute the slices
+        wav_slices, mel_slices = [], []
+        steps = max(1, n_frames - partials_n_frames + frame_step + 1)
+        for i in range(0, steps, frame_step):
+            mel_range = np.array([i, i + partials_n_frames])
+            wav_range = mel_range * samples_per_frame
+            mel_slices.append(slice(*mel_range))
+            wav_slices.append(slice(*wav_range))
+
+        # Evaluate whether extra padding is warranted or not
+        last_wav_range = wav_slices[-1]
+        coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
+        if coverage < min_coverage and len(mel_slices) > 1:
+            mel_slices = mel_slices[:-1]
+            wav_slices = wav_slices[:-1]
+
+        return wav_slices, mel_slices
+
+    def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75):
+        """
+        Computes an embedding for a single utterance. The utterance is divided into partial
+        utterances and an embedding is computed for each. The complete utterance embedding is the
+        L2-normed average embedding of the partial utterances.
+
+        TODO: independent batched version of this function
+
+        :param wav: a preprocessed utterance waveform as a numpy array of float32
+        :param return_partials: if True, the partial embeddings will also be returned along with
+        the wav slices corresponding to each partial utterance.
+        :param rate: how many partial utterances should occur per second. Partial utterances must
+        cover the span of the entire utterance, thus the rate should not be lower than the inverse
+        of the duration of a partial utterance. By default, partial utterances are 1.6s long and
+        the minimum rate is thus 0.625.
+        :param min_coverage: when reaching the last partial utterance, it may or may not have
+        enough frames. If at least <min_coverage> of <partials_n_frames> are present,
+        then the last partial utterance will be considered by zero-padding the audio. Otherwise,
+        it will be discarded. If there aren't enough frames for one partial utterance,
+        this parameter is ignored so that the function always returns at least one slice.
+        :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
+        <return_partials> is True, the partial utterances as a numpy array of float32 of shape
+        (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
+        returned.
+        """
+        # Compute where to split the utterance into partials and pad the waveform with zeros if
+        # the partial utterances cover a larger range.
+        wav_slices, mel_slices = self.compute_partial_slices(len(wav), rate, min_coverage)
+        max_wave_length = wav_slices[-1].stop
+        if max_wave_length >= len(wav):
+            wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
+
+        # Split the utterance into partials and forward them through the model
+        mel = audio.wav_to_mel_spectrogram(wav)
+        mels = np.array([mel[s] for s in mel_slices])
+        with torch.no_grad():
+            mels = torch.from_numpy(mels).to(self.device)
+            partial_embeds = self(mels).cpu().numpy()
+
+        # Compute the utterance embedding from the partial embeddings
+        raw_embed = np.mean(partial_embeds, axis=0)
+        embed = raw_embed / np.linalg.norm(raw_embed, 2)
+
+        if return_partials:
+            return embed, partial_embeds, wav_slices
+        return embed
+
+    def embed_speaker(self, wavs: List[np.ndarray], **kwargs):
+        """
+        Compute the embedding of a collection of wavs (presumably from the same speaker) by
+        averaging their embedding and L2-normalizing it.
+
+        :param wavs: list of wavs as numpy arrays of float32.
+        :param kwargs: extra arguments to embed_utterance()
+        :return: the embedding as a numpy array of float32 of shape (model_embedding_size,).
+        """
+        raw_embed = np.mean([self.embed_utterance(wav, return_partials=False, **kwargs) \
+                             for wav in wavs], axis=0)
+        return raw_embed / np.linalg.norm(raw_embed, 2)
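To tie the upload together, a hedged end-to-end sketch of the API this file exposes (the .wav paths are placeholders). Because the embeddings are non-negative and L2-normed, their dot product is a cosine similarity in [0, 1], which is what a verification demo would typically threshold:

import numpy as np
from resemblyzer import VoiceEncoder, preprocess_wav

encoder = VoiceEncoder()                      # picks cuda if available, else cpu

# Placeholder recordings of two speakers.
wav_a = preprocess_wav("speaker_a.wav")
wav_b = preprocess_wav("speaker_b.wav")

embed_a = encoder.embed_utterance(wav_a)      # float32, shape (256,), L2-normed
embed_b = encoder.embed_utterance(wav_b)
similarity = float(np.dot(embed_a, embed_b))  # cosine similarity in [0, 1]
print("similarity:", similarity)

# A speaker-level embedding averages several utterances of the same person.
speaker_embed = encoder.embed_speaker([wav_a, preprocess_wav("speaker_a_take2.wav")])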