# Generated 2022-11-24 from:
# /home/pcp22wc/exps/speaker-recognition/hparams/train_tdnn.yaml
# yamllint disable
# ################################
# Model: Speaker identification with Vanilla TDNN (Xvector)
# Authors: Yang Wang
# ################################
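# NOTE: minimal usage sketch. A resolved hparams file like this one is loaded
# with HyperPyYAML, which instantiates every !new: object and shares the
# &id/*id anchors; the file path below is only illustrative:
#
#   from hyperpyyaml import load_hyperpyyaml
#   with open("results/tdnn_augment/914/hyperparams.yaml") as fin:
#       hparams = load_hyperpyyaml(fin)
#   embedding_model = hparams["embedding_model"]   # an instantiated Xvector module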
# Basic parameters
seed: 914
__set_seed: !apply:torch.manual_seed [914]
output_folder: results/tdnn_augment/914
save_folder: results/tdnn_augment/914/save
train_log: results/tdnn_augment/914/train_log.txt
# Data files
data_folder: /fastdata/pcp22wc/audio/VoxCeleb2/dev, /fastdata/pcp22wc/audio/VoxCeleb1/test # e.g. /path/to/Voxceleb
train_annotation: results/tdnn_augment/914/save/train.csv
valid_annotation: results/tdnn_augment/914/save/dev.csv
# Folder to extract data augmentation files
rir_folder: /fastdata/pcp22wc/audio # Change it if needed
musan_folder: /fastdata/pcp22wc/audio/musan
music_csv: results/tdnn_augment/914/save/music.csv
noise_csv: results/tdnn_augment/914/save/noise.csv
speech_csv: results/tdnn_augment/914/save/speech.csv
# Use the following links for the official voxceleb splits:
# VoxCeleb1 (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt
# VoxCeleb1-H (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_hard2.txt
# VoxCeleb1-E (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_all2.txt
# VoxCeleb1-E and VoxCeleb1-H lists are drawn from the VoxCeleb1 training set.
# Therefore you cannot use any files in VoxCeleb1 for training if you are using these lists for testing.
verification_file: https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt
skip_prep: true
ckpt_interval_minutes: 15 # save checkpoint every N min
# Training parameters
number_of_epochs: 30
batch_size: 512
lr: 0.001
lr_final: 0.0001
step_size: 65000
sample_rate: 16000
sentence_len: 3.0 # seconds
shuffle: true
random_chunk: true
# Feature parameters
n_mels: 80
deltas: false
# Number of speakers
out_n_neurons: 5994 # 1211 for vox1, 5994 for vox2, 7205 for vox1+vox2
dataloader_options:
  batch_size: 512
  shuffle: true
  num_workers: 8
# Functions
compute_features: &id009 !new:speechbrain.lobes.features.Fbank
  n_mels: 80
  deltas: false
embedding_model: &id010 !new:speechbrain.lobes.models.Xvector.Xvector
  in_channels: 80
  activation: !name:torch.nn.LeakyReLU
  tdnn_blocks: 5
  tdnn_channels: [512, 512, 512, 512, 1500]
  tdnn_kernel_sizes: [5, 3, 3, 1, 1]
  tdnn_dilations: [1, 2, 3, 1, 1]
  lin_neurons: 512
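# For orientation (a hedged summary of the settings above): the x-vector model
# stacks 5 TDNN (dilated 1-d conv) blocks over the 80-dim filterbank frames
# with channel widths 512-512-512-512-1500, applies statistics pooling
# (per-utterance mean and std), and maps the result to a 512-dimensional
# speaker embedding (lin_neurons).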
classifier: &id011 !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
  input_size: 512
  out_neurons: 5994
epoch_counter: &id013 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 30
augment_wavedrop: &id001 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  sample_rate: 16000
  speeds: [100]
augment_speed: &id002 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  sample_rate: 16000
  speeds: [95, 100, 105]
add_rev: &id003 !new:speechbrain.lobes.augment.EnvCorrupt
  openrir_folder: /fastdata/pcp22wc/audio
  openrir_max_noise_len: 3.0 # seconds
  reverb_prob: 1.0
  noise_prob: 0.0
  noise_snr_low: 0
  noise_snr_high: 15
  rir_scale_factor: 1.0
add_noise: &id004 !new:speechbrain.lobes.augment.EnvCorrupt
  openrir_folder: /fastdata/pcp22wc/audio
  openrir_max_noise_len: 3.0 # seconds
  reverb_prob: 0.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15
  rir_scale_factor: 1.0
add_rev_noise: &id005 !new:speechbrain.lobes.augment.EnvCorrupt
  openrir_folder: /fastdata/pcp22wc/audio
  openrir_max_noise_len: 3.0 # seconds
  reverb_prob: 1.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15
  rir_scale_factor: 1.0
add_noise_musan: &id006 !new:speechbrain.lobes.augment.EnvCorrupt
  noise_csv: results/tdnn_augment/914/save/noise.csv
  babble_prob: 0.0
  reverb_prob: 0.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15
add_music_musan: &id007 !new:speechbrain.lobes.augment.EnvCorrupt
  noise_csv: results/tdnn_augment/914/save/music.csv
  babble_prob: 0.0
  reverb_prob: 0.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15
add_speech_musan: &id008 !new:speechbrain.lobes.augment.EnvCorrupt
  noise_csv: results/tdnn_augment/914/save/speech.csv
  babble_prob: 0.0
  reverb_prob: 0.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15
# Definition of the augmentation pipeline.
# If concat_augment = False, the augmentation techniques are applied
# in sequence. If concat_augment = True, all the augmented signals
# are concatenated in a single big batch.
augment_pipeline: [*id001, *id002, *id003, *id004, *id005, *id006, *id007, *id008]
concat_augment: true
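# Rough sketch of how the pipeline above is typically consumed inside the
# training script's compute_forward (a hedged paraphrase; variable names like
# `wavs` and `lens` are the usual recipe conventions, assumed here):
#
#   wavs_aug_tot = [wavs]
#   for augment in self.hparams.augment_pipeline:
#       wavs_aug = augment(wavs, lens)
#       if self.hparams.concat_augment:
#           wavs_aug_tot.append(wavs_aug)   # keep clean + each augmented copy
#       else:
#           wavs = wavs_aug                 # apply augmentations in sequence
#           wavs_aug_tot[0] = wavs
#   wavs = torch.cat(wavs_aug_tot, dim=0)
#   lens = torch.cat([lens] * len(wavs_aug_tot))
#
# With the 8 augmenters above and concat_augment: true, each 512-utterance
# batch therefore expands to roughly 9 x 512 segments before feature extraction.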
mean_var_norm: &id012 !new:speechbrain.processing.features.InputNormalization
  norm_type: sentence
  std_norm: false
modules:
  compute_features: *id009
  augment_wavedrop: *id001
  augment_speed: *id002
  add_rev: *id003
  add_noise: *id004
  add_rev_noise: *id005
  add_noise_musan: *id006
  add_music_musan: *id007
  add_speech_musan: *id008
  embedding_model: *id010
  classifier: *id011
  mean_var_norm: *id012
compute_cost: !new:speechbrain.nnet.losses.LogSoftmaxWrapper
  loss_fn: !new:speechbrain.nnet.losses.AdditiveAngularMargin
    margin: 0.2
    scale: 30
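# For reference (hedged summary): the classifier outputs cosine scores,
# AdditiveAngularMargin replaces the target-class score cos(theta_y) with
# cos(theta_y + margin) and scales all scores by `scale`, and LogSoftmaxWrapper
# applies log-softmax + NLL on top, i.e. roughly
#   loss = -log( exp(s*cos(theta_y + m))
#                / (exp(s*cos(theta_y + m)) + sum_{j != y} exp(s*cos(theta_j))) )
# with m = 0.2 and s = 30 as configured here.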
# compute_error: !name:speechbrain.nnet.losses.classification_error
opt_class: !name:torch.optim.Adam
  lr: 0.001
  weight_decay: 0.000002
lr_annealing: !new:speechbrain.nnet.schedulers.LinearScheduler
  initial_value: 0.001
  final_value: 0.0001
  epoch_count: 30
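# Hedged note: LinearScheduler interpolates the learning rate linearly from
# initial_value to final_value across epoch_count epochs, so the Adam lr here
# moves from 0.001 down to 0.0001 over the 30 epochs, i.e. a decrease of
# roughly (0.001 - 0.0001) / 29 ~= 3.1e-5 per epoch.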
# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: results/tdnn_augment/914/train_log.txt
error_stats: !name:speechbrain.utils.metric_stats.MetricStats
  metric: !name:speechbrain.nnet.losses.classification_error
    reduction: batch
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: results/tdnn_augment/914/save
  recoverables:
    embedding_model: *id010
    classifier: *id011
    normalizer: *id012
    counter: *id013