File size: 2,130 Bytes
c858225 6c4bc12 c858225 25df5de c858225 25df5de c858225 25df5de c858225 25df5de c858225 25df5de c858225 25df5de b665af3 c858225 b665af3 c858225 50c7f0a b665af3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
# #################################
# The recipe for training PIQ on the ESC50 dataset.
#
# Authors:
# * Cem Subakan 2022, 2023
# * Francesco Paissan 2022, 2023
# (based on the SpeechBrain UrbanSound8k recipe)
# #################################
# ---- Global settings ----
# Audio sampling rate; the feature extractors defined further down are
# configured with this same value.
sample_rate: 16000
device: cpu

# ---- Interpreter options ----
# These flags are not referenced elsewhere in this file; presumably they are
# read by the code that loads these hyperparameters — confirm against caller.
use_vq: true
use_mask_output: true
mask_th: 0.35
rec_loss_coef: 1

# ---- Feature parameters ----
n_mels: 80

# ---- Number of classes ----
out_n_neurons: 50
# Encoder module. It is registered in `modules` and listed in the
# pretrainer's `loadables`, so its weights come from a checkpoint.
# Earlier variant, kept for reference:
# embedding_model: !new:custom_models.Conv2dEncoder_v2
embedding_model: !new:speechbrain.lobes.models.PIQ.Conv2dEncoder_v2
  # Constructor argument: must be nested under the `!new:` tag, otherwise it
  # is parsed as an unrelated top-level key.
  dim: 256
# Classifier module. It is registered in `modules` and listed in the
# pretrainer's `loadables`.
classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
  # NOTE(review): presumably has to equal embedding_model's `dim`
  # (both are 256 here) — confirm.
  input_size: 256
  # Tied to the global class count instead of repeating the literal 50.
  out_neurons: !ref <out_n_neurons>
  lin_blocks: 1
# ---- Interpretation hyperparameters ----
# Same value as the `K` argument given to psi_model.
K: 1024

# ---- Pre-processing ----
n_fft: 1024
# NOTE(review): window/hop lengths are presumably expressed in milliseconds,
# as in SpeechBrain's STFT/ISTFT — confirm against the library docs.
win_length: 23.2199
hop_length: 11.6099
spec_mag_power: 0.5
# STFT feature extractor. Its arguments are nested under the `!new:` tag and
# reference the top-level values (same numbers as before), so overriding a
# top-level value keeps this object in sync.
compute_stft: !new:speechbrain.processing.features.STFT
  n_fft: !ref <n_fft>
  hop_length: !ref <hop_length>
  win_length: !ref <win_length>
  sample_rate: !ref <sample_rate>
# Filterbank feature extractor. Its arguments are nested under the `!new:`
# tag and reference the top-level values (80 / 1024 / 16000, unchanged).
compute_fbank: !new:speechbrain.processing.features.Filterbank
  n_mels: !ref <n_mels>
  n_fft: !ref <n_fft>
  sample_rate: !ref <sample_rate>
# Inverse STFT. It uses the same sample rate, hop and window values as
# compute_stft, now referenced from the top-level keys rather than repeated.
compute_istft: !new:speechbrain.processing.features.ISTFT
  sample_rate: !ref <sample_rate>
  hop_length: !ref <hop_length>
  win_length: !ref <win_length>
# Label encoder instance (constructed with no arguments). The pretrainer
# lists it under `loadables` and maps it to a label_encoder.txt file.
label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
# Vector-quantized PSI model for audio. It is registered in `modules` under
# the key `psi` and listed in the pretrainer's `loadables`.
psi_model: !new:speechbrain.lobes.models.PIQ.VectorQuantizedPSI_Audio
  # NOTE(review): presumably has to match embedding_model's `dim`
  # (both are 256 here) — confirm.
  dim: 256
  # References the top-level K (1024) instead of duplicating it.
  K: !ref <K>
  shared_keys: 0
  activate_class_partitioning: true
  use_adapter: true
  adapter_reduce_dim: true
# Name -> object mapping of the components defined above. The entries must be
# nested under `modules`; left at top level they would collide with the
# definitions of the same name.
modules:
  compute_stft: !ref <compute_stft>
  compute_fbank: !ref <compute_fbank>
  compute_istft: !ref <compute_istft>
  psi: !ref <psi_model>
  embedding_model: !ref <embedding_model>
  classifier: !ref <classifier>
# Pretrainer: `loadables` names the objects to load into, and `paths` gives
# the source file for each of those names (the keys must match).
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    embedding_model: !ref <embedding_model>
    classifier: !ref <classifier>
    psi: !ref <psi_model>
    label_encoder: !ref <label_encoder>
  paths:
    embedding_model: speechbrain/PIQ-ESC50/embedding_modelft.ckpt
    classifier: speechbrain/PIQ-ESC50/classifier.ckpt
    psi: speechbrain/PIQ-ESC50/psi_model.ckpt
    # NOTE(review): this path points at the cnn14-esc50 repository while the
    # checkpoints above come from PIQ-ESC50 — kept as-is; confirm intended.
    label_encoder: speechbrain/cnn14-esc50/label_encoder.txt