# File size: 2,130 Bytes

# #################################
# Hyperparameters for the PIQ interpreter on the ESC50 dataset
# (model definitions plus pretrained checkpoints loaded via `pretrainer`).
#
# Author:
#  * Cem Subakan 2022, 2023
#  * Francesco Paissan 2022, 2023
#  (based on the SpeechBrain UrbanSound8k recipe)
# #################################

# Audio sample rate (presumably in Hz). The same value is passed to the
# STFT, filterbank and ISTFT entries further down in this file.
sample_rate: 16000
# NOTE(review): the four settings below are not referenced anywhere else in
# this file; they are presumably read by the code that loads it. Judging by
# the names only: use_vq toggles vector quantization, rec_loss_coef weights a
# reconstruction loss, use_mask_output / mask_th control a thresholded mask.
# TODO confirm against the consuming interface.
use_vq: true
rec_loss_coef: 1
use_mask_output: true
mask_th: 0.35

# Device name; not referenced elsewhere in this file.
device: cpu

# Feature parameters
# Number of mel bands; equals the n_mels given to compute_fbank.
n_mels: 80

# Number of classes
# Equals the out_neurons given to the classifier.
out_n_neurons: 50

# Encoder that produces the embeddings; its `dim` (256) equals the
# classifier's `input_size` and the psi_model's `dim`.
# An earlier variant of this entry pointed at a local class instead:
# embedding_model: !new:custom_models.Conv2dEncoder_v2
embedding_model: !new:speechbrain.lobes.models.PIQ.Conv2dEncoder_v2
  dim: 256

# Classification head applied to the encoder embeddings.
# `input_size` equals the `dim` of embedding_model (256); `out_neurons`
# references the top-level class count so the two cannot drift apart.
classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
  input_size: 256
  out_neurons: !ref <out_n_neurons>
  lin_blocks: 1

# Interpretation hyperparams
# Equals the K given to psi_model; presumably the size of the
# vector-quantization codebook — TODO confirm against VectorQuantizedPSI_Audio.
K: 1024

# pre-processing
# Top-level STFT settings. compute_stft references these keys (and
# sample_rate) instead of repeating the literals, so changing or overriding
# a value here is reflected in the constructed object.
n_fft: 1024
# NOTE(review): spec_mag_power is not referenced elsewhere in this file;
# presumably the exponent applied to the spectrogram magnitude by the
# consuming code — confirm.
spec_mag_power: 0.5
# hop_length / win_length are fractional, which suggests milliseconds (the
# unit SpeechBrain's STFT documents) rather than samples — TODO confirm.
hop_length: 11.6099
win_length: 23.2199
compute_stft: !new:speechbrain.processing.features.STFT
  n_fft: !ref <n_fft>
  hop_length: !ref <hop_length>
  win_length: !ref <win_length>
  sample_rate: !ref <sample_rate>

# Mel filterbank. All three arguments reference the top-level keys of the
# same name so the filterbank stays in sync with the STFT configuration.
compute_fbank: !new:speechbrain.processing.features.Filterbank
  n_mels: !ref <n_mels>
  n_fft: !ref <n_fft>
  sample_rate: !ref <sample_rate>

# Inverse STFT. Uses the same sample_rate / hop_length / win_length keys as
# the forward transform; referencing them keeps analysis and synthesis
# parameters identical.
compute_istft: !new:speechbrain.processing.features.ISTFT
  sample_rate: !ref <sample_rate>
  hop_length: !ref <hop_length>
  win_length: !ref <win_length>

# Label encoder instance, constructed with no arguments; it is listed in the
# pretrainer's `loadables`, which pairs it with a label_encoder.txt path.
label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
# PSI (interpreter) model. `dim` equals the embedding_model's `dim` (256);
# `K` references the top-level K under "Interpretation hyperparams".
# NOTE(review): the remaining flags are passed through unchanged; their
# exact semantics are defined by VectorQuantizedPSI_Audio — confirm there.
psi_model: !new:speechbrain.lobes.models.PIQ.VectorQuantizedPSI_Audio
  dim: 256
  K: !ref <K>
  shared_keys: 0
  activate_class_partitioning: true
  use_adapter: true
  adapter_reduce_dim: true

# Objects grouped under `modules`. Every value is a reference to an entry
# defined earlier in this file; note that psi_model is exposed under the
# shorter key `psi`.
modules:
  compute_stft: !ref <compute_stft>
  compute_fbank: !ref <compute_fbank>
  compute_istft: !ref <compute_istft>
  psi: !ref <psi_model>
  embedding_model: !ref <embedding_model>
  classifier: !ref <classifier>

# Pretrainer entry. `loadables` names the objects to initialise and `paths`
# gives, under the same keys, the checkpoint file for each of them.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    embedding_model: !ref <embedding_model>
    classifier: !ref <classifier>
    psi: !ref <psi_model>
    label_encoder: !ref <label_encoder>
  paths:
    embedding_model: speechbrain/PIQ-ESC50/embedding_modelft.ckpt
    classifier: speechbrain/PIQ-ESC50/classifier.ckpt
    psi: speechbrain/PIQ-ESC50/psi_model.ckpt
    # NOTE(review): this path is under cnn14-esc50, unlike the three
    # checkpoints above (PIQ-ESC50) — confirm this is intentional.
    label_encoder: speechbrain/cnn14-esc50/label_encoder.txt