fhieni committed
Commit e68cefa
1 Parent(s): 8b5177e

Upload 9 files

resemblyzer/__init__.py ADDED
@@ -0,0 +1,6 @@
+ name = "resemblyzer"
+
+ from resemblyzer.audio import preprocess_wav, wav_to_mel_spectrogram, trim_long_silences, \
+     normalize_volume
+ from resemblyzer.hparams import sampling_rate
+ from resemblyzer.voice_encoder import VoiceEncoder
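These exports make up the package's public API. A minimal usage sketch of how they compose (the file path `audio.wav` is a hypothetical example, not part of the commit):

```python
from resemblyzer import preprocess_wav, VoiceEncoder

# Load and preprocess any audio file: resample to 16 kHz, normalize the
# volume, and trim long silences.
wav = preprocess_wav("audio.wav")

# Embed the utterance into a fixed-size speaker embedding.
encoder = VoiceEncoder()
embed = encoder.embed_utterance(wav)
print(embed.shape)  # (256,), i.e. model_embedding_size
```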
resemblyzer/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (444 Bytes).
 
resemblyzer/__pycache__/audio.cpython-38.pyc ADDED
Binary file (3.8 kB).
 
resemblyzer/__pycache__/hparams.cpython-38.pyc ADDED
Binary file (514 Bytes).
 
resemblyzer/__pycache__/voice_encoder.cpython-38.pyc ADDED
Binary file (8.47 kB).
 
resemblyzer/audio.py ADDED
@@ -0,0 +1,108 @@
+ from scipy.ndimage import binary_dilation
+ from resemblyzer.hparams import *
+ from pathlib import Path
+ from typing import Optional, Union
+ import numpy as np
+ import webrtcvad
+ import librosa
+ import struct
+
+ int16_max = (2 ** 15) - 1
+
+
+ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], source_sr: Optional[int]=None):
+     """
+     Applies preprocessing operations to a waveform, either on disk or in memory, to make it
+     ready for the voice encoder. The waveform will be resampled to match the data
+     hyperparameters.
+
+     :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
+     just .wav), or the waveform as a numpy array of floats.
+     :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
+     preprocessing. After preprocessing, the waveform's sampling rate will match the data
+     hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
+     this argument will be ignored.
+     """
+     # Load the wav from disk if needed
+     if isinstance(fpath_or_wav, (str, Path)):
+         wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
+     else:
+         wav = fpath_or_wav
+
+     # Resample the wav to match the data hyperparameters
+     if source_sr is not None:
+         wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)
+
+     # Apply the preprocessing: normalize volume and shorten long silences
+     wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
+     wav = trim_long_silences(wav)
+
+     return wav
+
+
+ def wav_to_mel_spectrogram(wav):
+     """
+     Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio
+     waveform. Note: this is not a log-mel spectrogram.
+     """
+     frames = librosa.feature.melspectrogram(
+         y=wav,
+         sr=sampling_rate,
+         n_fft=int(sampling_rate * mel_window_length / 1000),
+         hop_length=int(sampling_rate * mel_window_step / 1000),
+         n_mels=mel_n_channels
+     )
+     return frames.astype(np.float32).T
+
+
+ def trim_long_silences(wav):
+     """
+     Ensures that segments without voice in the waveform remain no longer than a threshold
+     determined by the VAD parameters in hparams.py.
+
+     :param wav: the raw waveform as a numpy array of floats
+     :return: the same waveform with silences trimmed away (length <= original wav length)
+     """
+     # Compute the voice detection window size
+     samples_per_window = (vad_window_length * sampling_rate) // 1000
+
+     # Trim the end of the audio to have a multiple of the window size
+     wav = wav[:len(wav) - (len(wav) % samples_per_window)]
+
+     # Convert the float waveform to 16-bit mono PCM
+     pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
+
+     # Perform voice activity detection
+     voice_flags = []
+     vad = webrtcvad.Vad(mode=3)
+     for window_start in range(0, len(wav), samples_per_window):
+         window_end = window_start + samples_per_window
+         voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
+                                          sample_rate=sampling_rate))
+     voice_flags = np.array(voice_flags)
+
+     # Smooth the voice detection with a moving average
+     def moving_average(array, width):
+         array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
+         ret = np.cumsum(array_padded, dtype=float)
+         ret[width:] = ret[width:] - ret[:-width]
+         return ret[width - 1:] / width
+
+     audio_mask = moving_average(voice_flags, vad_moving_average_width)
+     audio_mask = np.round(audio_mask).astype(np.bool_)
+
+     # Dilate the voiced regions
+     audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
+     audio_mask = np.repeat(audio_mask, samples_per_window)
+
+     return wav[audio_mask]
+
+
+ def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
+     if increase_only and decrease_only:
+         raise ValueError("Both increase only and decrease only are set")
+     rms = np.sqrt(np.mean((wav * int16_max) ** 2))
+     wave_dBFS = 20 * np.log10(rms / int16_max)
+     dBFS_change = target_dBFS - wave_dBFS
+     if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
+         return wav
+     return wav * (10 ** (dBFS_change / 20))
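The dBFS arithmetic in `normalize_volume` is easy to check by hand. A minimal sketch (a sine wave stands in for real speech; a full-scale sine has RMS 1/√2, i.e. about -3 dBFS):

```python
import numpy as np
from resemblyzer.audio import normalize_volume, wav_to_mel_spectrogram

# One second of a full-scale 440 Hz sine at 16 kHz.
t = np.linspace(0, 1, 16000, endpoint=False)
wav = np.sin(2 * np.pi * 440 * t).astype(np.float32)

# Bring it down to the -30 dBFS target: gain = 10 ** ((-30 - (-3.01)) / 20)
quiet = normalize_volume(wav, target_dBFS=-30)

# increase_only leaves an already-loud signal untouched (same object returned).
same = normalize_volume(wav, target_dBFS=-30, increase_only=True)
assert same is wav

# 25 ms windows with a 10 ms step give roughly 100 frames per second.
mel = wav_to_mel_spectrogram(quiet)
print(mel.shape)  # roughly (101, 40): ~100 frames, 40 mel channels
```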
resemblyzer/hparams.py ADDED
@@ -0,0 +1,33 @@
+
+ ## Mel-filterbank
+ mel_window_length = 25  # In milliseconds
+ mel_window_step = 10    # In milliseconds
+ mel_n_channels = 40
+
+
+ ## Audio
+ sampling_rate = 16000
+ # Number of spectrogram frames in a partial utterance
+ partials_n_frames = 160  # 1600 ms
+
+
+ ## Voice Activity Detection
+ # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
+ # This sets the granularity of the VAD. Should not need to be changed.
+ vad_window_length = 30  # In milliseconds
+ # Number of frames to average together when performing the moving average smoothing.
+ # The larger this value, the larger the VAD variations must be to not get smoothed out.
+ vad_moving_average_width = 8
+ # Maximum number of consecutive silent frames a segment can have.
+ vad_max_silence_length = 6
+
+
+ ## Audio volume normalization
+ audio_norm_target_dBFS = -30
+
+
+ ## Model parameters
+ model_hidden_size = 256
+ model_embedding_size = 256
+ model_num_layers = 3
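For orientation, the quantities that audio.py and voice_encoder.py derive from these values, as a worked sketch (names on the left match the variables those files compute):

```python
sampling_rate = 16000
mel_window_step = 10      # ms
partials_n_frames = 160
vad_window_length = 30    # ms

# Samples covered by one mel frame and by one VAD window.
samples_per_frame = sampling_rate * mel_window_step // 1000     # 160
samples_per_window = vad_window_length * sampling_rate // 1000  # 480

# A partial utterance spans 160 frames of 10 ms each, i.e. 1.6 s, so the
# minimum `rate` accepted by VoiceEncoder.compute_partial_slices is 1 / 1.6.
min_rate = sampling_rate / (samples_per_frame * partials_n_frames)  # 0.625
```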
resemblyzer/pretrained.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:39373b86598fa3da9fcddee6142382efe09777e8d37dc9c0561f41f0070f134e
+ size 17090379
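This is a Git LFS pointer, not the weights themselves; the actual ~17 MB file is fetched on checkout. A quick way to check a downloaded copy against the recorded oid (the local path is an assumption):

```python
import hashlib

# Hash the fetched weights file and compare against the LFS pointer's oid.
with open("resemblyzer/pretrained.pt", "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()
assert digest == "39373b86598fa3da9fcddee6142382efe09777e8d37dc9c0561f41f0070f134e"
```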
resemblyzer/voice_encoder.py ADDED
@@ -0,0 +1,177 @@
+ from resemblyzer.hparams import *
+ from resemblyzer import audio
+ from pathlib import Path
+ from typing import Union, List
+ from torch import nn
+ from time import perf_counter as timer
+ import numpy as np
+ import torch
+
+
+ class VoiceEncoder(nn.Module):
+     def __init__(self, device: Union[str, torch.device]=None, verbose=True,
+                  weights_fpath: Union[Path, str]=None):
+         """
+         :param device: either a torch device or its name. If None, defaults to cuda if it is
+         available on your machine, otherwise the model will run on cpu. Outputs are always
+         returned on the cpu, as numpy arrays.
+         :param weights_fpath: path to a "<CUSTOM_MODEL>.pt" file. If None, defaults to the
+         built-in "pretrained.pt" model.
+         """
+         super().__init__()
+
+         # Define the network
+         self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
+         self.linear = nn.Linear(model_hidden_size, model_embedding_size)
+         self.relu = nn.ReLU()
+
+         # Get the target device
+         if device is None:
+             device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         elif isinstance(device, str):
+             device = torch.device(device)
+         self.device = device
+
+         # Load the pretrained model's weights
+         if weights_fpath is None:
+             weights_fpath = Path(__file__).resolve().parent.joinpath("pretrained.pt")
+         else:
+             weights_fpath = Path(weights_fpath)
+
+         if not weights_fpath.exists():
+             raise FileNotFoundError("Couldn't find the voice encoder pretrained model at %s." %
+                                     weights_fpath)
+         start = timer()
+         checkpoint = torch.load(weights_fpath, map_location="cpu")
+         self.load_state_dict(checkpoint["model_state"], strict=False)
+         self.to(device)
+
+         if verbose:
+             print("Loaded the voice encoder model on %s in %.2f seconds." %
+                   (device.type, timer() - start))
+
+     def forward(self, mels: torch.FloatTensor):
+         """
+         Computes the embeddings of a batch of utterance spectrograms.
+
+         :param mels: a batch of mel spectrograms of same duration as a float32 tensor of shape
+         (batch_size, n_frames, n_channels)
+         :return: the embeddings as a float32 tensor of shape (batch_size, embedding_size).
+         Embeddings are positive and L2-normed, thus they lie in the range [0, 1].
+         """
+         # Pass the input through the LSTM layers and retrieve the final hidden state of the last
+         # layer. Apply a cutoff to 0 for negative values and L2-normalize the embeddings.
+         _, (hidden, _) = self.lstm(mels)
+         embeds_raw = self.relu(self.linear(hidden[-1]))
+         return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
+
+     @staticmethod
+     def compute_partial_slices(n_samples: int, rate, min_coverage):
+         """
+         Computes where to split an utterance waveform and its corresponding mel spectrogram to
+         obtain partial utterances of <partials_n_frames> each. Both the waveform and the
+         mel spectrogram slices are returned, so as to make each partial utterance waveform
+         correspond to its spectrogram.
+
+         The returned ranges may index further than the length of the waveform. It is
+         recommended that you pad the waveform with zeros up to wav_slices[-1].stop.
+
+         :param n_samples: the number of samples in the waveform
+         :param rate: how many partial utterances should occur per second. Partial utterances must
+         cover the span of the entire utterance, thus the rate should not be lower than the inverse
+         of the duration of a partial utterance. By default, partial utterances are 1.6s long and
+         the minimum rate is thus 0.625.
+         :param min_coverage: when reaching the last partial utterance, it may or may not have
+         enough frames. If at least <min_coverage> of <partials_n_frames> are present,
+         then the last partial utterance will be considered by zero-padding the audio. Otherwise,
+         it will be discarded. If there aren't enough frames for one partial utterance,
+         this parameter is ignored so that the function always returns at least one slice.
+         :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
+         respectively the waveform and the mel spectrogram with these slices to obtain the partial
+         utterances.
+         """
+         assert 0 < min_coverage <= 1
+
+         # Compute how many frames separate two partial utterances
+         samples_per_frame = int((sampling_rate * mel_window_step / 1000))
+         n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
+         frame_step = int(np.round((sampling_rate / rate) / samples_per_frame))
+         assert 0 < frame_step, "The rate is too high"
+         assert frame_step <= partials_n_frames, "The rate is too low, it should be %f at least" % \
+             (sampling_rate / (samples_per_frame * partials_n_frames))
+
+         # Compute the slices
+         wav_slices, mel_slices = [], []
+         steps = max(1, n_frames - partials_n_frames + frame_step + 1)
+         for i in range(0, steps, frame_step):
+             mel_range = np.array([i, i + partials_n_frames])
+             wav_range = mel_range * samples_per_frame
+             mel_slices.append(slice(*mel_range))
+             wav_slices.append(slice(*wav_range))
+
+         # Evaluate whether extra padding is warranted or not
+         last_wav_range = wav_slices[-1]
+         coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
+         if coverage < min_coverage and len(mel_slices) > 1:
+             mel_slices = mel_slices[:-1]
+             wav_slices = wav_slices[:-1]
+
+         return wav_slices, mel_slices
+
+     def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75):
+         """
+         Computes an embedding for a single utterance. The utterance is divided into partial
+         utterances and an embedding is computed for each. The complete utterance embedding is the
+         L2-normed average embedding of the partial utterances.
+
+         TODO: independent batched version of this function
+
+         :param wav: a preprocessed utterance waveform as a numpy array of float32
+         :param return_partials: if True, the partial embeddings will also be returned along with
+         the wav slices corresponding to each partial utterance.
+         :param rate: how many partial utterances should occur per second. Partial utterances must
+         cover the span of the entire utterance, thus the rate should not be lower than the inverse
+         of the duration of a partial utterance. By default, partial utterances are 1.6s long and
+         the minimum rate is thus 0.625.
+         :param min_coverage: when reaching the last partial utterance, it may or may not have
+         enough frames. If at least <min_coverage> of <partials_n_frames> are present,
+         then the last partial utterance will be considered by zero-padding the audio. Otherwise,
+         it will be discarded. If there aren't enough frames for one partial utterance,
+         this parameter is ignored so that the function always returns at least one slice.
+         :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
+         <return_partials> is True, the partial utterances as a numpy array of float32 of shape
+         (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
+         returned.
+         """
+         # Compute where to split the utterance into partials and pad the waveform with zeros if
+         # the partial utterances cover a larger range.
+         wav_slices, mel_slices = self.compute_partial_slices(len(wav), rate, min_coverage)
+         max_wave_length = wav_slices[-1].stop
+         if max_wave_length >= len(wav):
+             wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
+
+         # Split the utterance into partials and forward them through the model
+         mel = audio.wav_to_mel_spectrogram(wav)
+         mels = np.array([mel[s] for s in mel_slices])
+         with torch.no_grad():
+             mels = torch.from_numpy(mels).to(self.device)
+             partial_embeds = self(mels).cpu().numpy()
+
+         # Compute the utterance embedding from the partial embeddings
+         raw_embed = np.mean(partial_embeds, axis=0)
+         embed = raw_embed / np.linalg.norm(raw_embed, 2)
+
+         if return_partials:
+             return embed, partial_embeds, wav_slices
+         return embed
+
+     def embed_speaker(self, wavs: List[np.ndarray], **kwargs):
+         """
+         Computes the embedding of a collection of wavs (presumably from the same speaker) by
+         averaging their embeddings and L2-normalizing the result.
+
+         :param wavs: list of wavs as numpy arrays of float32.
+         :param kwargs: extra arguments to embed_utterance()
+         :return: the embedding as a numpy array of float32 of shape (model_embedding_size,).
+         """
+         raw_embed = np.mean([self.embed_utterance(wav, return_partials=False, **kwargs)
+                              for wav in wavs], axis=0)
+         return raw_embed / np.linalg.norm(raw_embed, 2)
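A typical downstream use of these embeddings is speaker verification via cosine similarity; since `embed_utterance` returns unit-norm vectors, a dot product suffices. A short sketch (the two file paths are hypothetical):

```python
import numpy as np
from resemblyzer import preprocess_wav, VoiceEncoder

encoder = VoiceEncoder("cpu")
embed_a = encoder.embed_utterance(preprocess_wav("speaker_a.wav"))
embed_b = encoder.embed_utterance(preprocess_wav("speaker_b.wav"))

# Both embeddings are L2-normed, so the dot product is the cosine similarity.
similarity = np.dot(embed_a, embed_b)
print("similarity: %.3f" % similarity)  # close to 1.0 for the same speaker
```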