File size: 2,130 Bytes
c858225
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c4bc12
c858225
 
 
 
 
 
 
 
25df5de
c858225
 
25df5de
c858225
 
 
 
 
 
 
 
 
 
 
 
25df5de
c858225
 
 
 
 
25df5de
c858225
 
 
 
25df5de
c858225
 
 
 
 
25df5de
 
 
 
 
 
 
b665af3
c858225
b665af3
 
 
 
c858225
 
 
 
 
 
 
 
 
 
50c7f0a
 
 
b665af3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# #################################
# The recipe for training PIQ on the ESC50 dataset.
#
# Authors:
#  * Cem Subakan 2022, 2023
#  * Francesco Paissan 2022, 2023
#  (based on the SpeechBrain UrbanSound8k recipe)
# #################################

# General settings.
# `sample_rate` carries the same value as the `sample_rate` argument given to
# `compute_stft`, `compute_fbank` and `compute_istft`.
# NOTE(review): presumably expressed in Hz -- confirm.
sample_rate: 16000
# `use_vq`, `rec_loss_coef`, `use_mask_output` and `mask_th` are not referenced
# by any other entry in this file; presumably they are read directly by the
# script that loads these hyperparameters -- TODO confirm.
use_vq: true
rec_loss_coef: 1
use_mask_output: true
mask_th: 0.35

# Not referenced by any other entry in this file.
device: cpu

# Feature parameters
# Same value as the `n_mels` argument of `compute_fbank`.
n_mels: 80

# Number of classes
# Same value as the `out_neurons` argument of `classifier`.
out_n_neurons: 50

# Instantiates `Conv2dEncoder_v2` from `speechbrain.lobes.models.PIQ`.
# `dim` has the same value as the classifier's `input_size` and as the `dim`
# argument of `psi_model`.
# The next line is a disabled alternative that points at a `custom_models`
# module instead of the SpeechBrain package:
# embedding_model: !new:custom_models.Conv2dEncoder_v2
embedding_model: !new:speechbrain.lobes.models.PIQ.Conv2dEncoder_v2
  dim: 256

# Instantiates `Classifier` from `speechbrain.lobes.models.ECAPA_TDNN`.
# `out_neurons` references the top-level `out_n_neurons` so that the number of
# classes is declared in exactly one place and cannot drift when overridden.
classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
  # Same value as the `dim` argument of `embedding_model`.
  input_size: 256
  out_neurons: !ref <out_n_neurons>
  lin_blocks: 1

# Interpretation hyperparams
# The same value is given to `psi_model` as its `K` argument.
# NOTE(review): presumably the number of entries in the vector-quantizer
# codebook -- confirm against `VectorQuantizedPSI_Audio`.
K: 1024

# pre-processing
n_fft: 1024
# Not referenced by any other entry in this file; presumably read by the
# script that loads these hyperparameters -- TODO confirm.
spec_mag_power: 0.5
# NOTE(review): the hop and window lengths are fractional, so they are
# presumably expressed in milliseconds rather than samples -- confirm against
# the STFT class.
hop_length: 11.6099
win_length: 23.2199
# STFT module. Its arguments reference the top-level keys through `!ref`
# instead of repeating the literals, so overriding a key cannot leave two
# copies of the same setting out of sync.
compute_stft: !new:speechbrain.processing.features.STFT
  n_fft: !ref <n_fft>
  hop_length: !ref <hop_length>
  win_length: !ref <win_length>
  sample_rate: !ref <sample_rate>

# Filterbank module. `n_mels`, `n_fft` and `sample_rate` reference the
# top-level keys of the same name through `!ref` rather than repeating the
# literals, so the settings stay in agreement when one of them is overridden.
compute_fbank: !new:speechbrain.processing.features.Filterbank
  n_mels: !ref <n_mels>
  n_fft: !ref <n_fft>
  sample_rate: !ref <sample_rate>

# Inverse-STFT module. Its arguments reference the top-level `sample_rate`,
# `hop_length` and `win_length` keys through `!ref` rather than repeating the
# literals, so the values track any override of those keys.
compute_istft: !new:speechbrain.processing.features.ISTFT
  sample_rate: !ref <sample_rate>
  hop_length: !ref <hop_length>
  win_length: !ref <win_length>

# Instantiated without constructor arguments. It is listed among the
# `pretrainer` loadables, with `label_encoder.txt` as its associated path.
label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
# Instantiates `VectorQuantizedPSI_Audio` from `speechbrain.lobes.models.PIQ`.
# Arguments use the same 2-space indentation as every other mapping in this
# file, and `K` references the top-level `K` key instead of repeating it.
psi_model: !new:speechbrain.lobes.models.PIQ.VectorQuantizedPSI_Audio
  # Same value as the `dim` argument of `embedding_model`.
  dim: 256
  K: !ref <K>
  shared_keys: 0
  activate_class_partitioning: true
  use_adapter: true
  adapter_reduce_dim: true

# Name -> object mapping for the modules instantiated above; every value is a
# `!ref` to a top-level entry of this file.
# Note that the interpreter is registered under the key `psi`, not `psi_model`.
# `label_encoder` is not part of this mapping.
modules:
  compute_stft: !ref <compute_stft>
  compute_fbank: !ref <compute_fbank>
  compute_istft: !ref <compute_istft>
  psi: !ref <psi_model>
  embedding_model: !ref <embedding_model>
  classifier: !ref <classifier>

# Instantiates `Pretrainer` from `speechbrain.utils.parameter_transfer`.
# `loadables` and `paths` share the same four keys: `loadables` names the
# objects defined in this file, `paths` gives the file associated with each.
# NOTE(review): the paths look like `<hub repository>/<file name>` -- confirm
# how the loading script resolves them.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    embedding_model: !ref <embedding_model>
    classifier: !ref <classifier>
    psi: !ref <psi_model>
    label_encoder: !ref <label_encoder>
  paths:
    embedding_model: speechbrain/PIQ-ESC50/embedding_modelft.ckpt
    classifier: speechbrain/PIQ-ESC50/classifier.ckpt
    psi: speechbrain/PIQ-ESC50/psi_model.ckpt
    # Unlike the three entries above, this one points at `speechbrain/cnn14-esc50`.
    label_encoder: speechbrain/cnn14-esc50/label_encoder.txt