# Generated 2024-02-06 from:
# /mnt3/speechbrain/recipes/LibriMix/separation/hparams/sepformer-libri3mix.yaml
# yamllint disable
# ################################
# Model: SepFormer for source separation
# https://arxiv.org/abs/2010.13154
# Dataset : Libri3Mix
# ################################
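#
# A minimal loading sketch (hedged; assumes the hyperpyyaml package that
# SpeechBrain uses to parse files like this one):
#
#   from hyperpyyaml import load_hyperpyyaml
#   with open("sepformer-libri3mix.yaml") as f:
#       hparams = load_hyperpyyaml(f)
#   masknet = hparams["MaskNet"]  # !new: entries are returned instantiated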
#
# Basic parameters
# Seed needs to be set at top of yaml, before objects with parameters are made
#
seed: 1234
__set_seed: !apply:torch.manual_seed [1234]

# Data params

# e.g. '/yourpath/Libri3Mix/train-clean-360/'
# the data folder is needed even if dynamic mixing is applied
data_folder: /mnt3/Libri3Mix_48k_own/Libri3Mix/

# This is needed only if dynamic mixing is applied
base_folder_dm: /yourpath/LibriSpeech/train-clean-360/

experiment_name: sepformer-libri3mix-48k
output_folder: results/sepformer-libri3mix-48k/1234
train_log: results/sepformer-libri3mix-48k/1234/train_log.txt
save_folder: results/sepformer-libri3mix-48k/1234/save
train_data: results/sepformer-libri3mix-48k/1234/save/libri3mix_train-360.csv
valid_data: results/sepformer-libri3mix-48k/1234/save/libri3mix_test.csv
test_data: results/sepformer-libri3mix-48k/1234/save/libri3mix_test.csv
skip_prep: false

ckpt_interval_minutes: 60

# Experiment params
precision: fp16 # bf16, fp16 or fp32 (fp16/bf16 enable mixed-precision training)
num_spks: 3
noprogressbar: false
save_audio: false # Save estimated sources on disk
sample_rate: 48000

# Training parameters
N_epochs: 50
batch_size: 1
lr: 0.0005
clip_grad_norm: 5
loss_upper_lim: 999999  # this is the upper limit for an acceptable loss
# if True, the training sequences are cut to a specified length
limit_training_signal_len: true
# this is the length of sequences if we choose to limit
# the signal length of training sequences
training_signal_len: 100000

# Set it to True to dynamically create mixtures at training time
dynamic_mixing: false
use_wham_noise: true

# Parameters for data augmentation
use_wavedrop: false
use_speedperturb: true
use_rand_shift: false
min_shift: -8000
max_shift: 8000

# Speed perturbation
speed_changes: &id001 [95, 100, 105]

speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
  orig_freq: 48000
  speeds: *id001

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Lowest dropped band, as a fraction of Nyquist
drop_freq_high: 1  # Highest dropped band, as a fraction of Nyquist
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of frequency bands to drop

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
  drop_freq_low: 0
  drop_freq_high: 1
  drop_freq_count_low: 1
  drop_freq_count_high: 3
  drop_freq_width: 0.05

# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1  # Min number of audio chunks to drop
drop_chunk_count_high: 5  # Max number of audio chunks to drop
drop_chunk_length_low: 1000  # Min length of audio chunks to drop
drop_chunk_length_high: 2000  # Max length of audio chunks to drop

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
  drop_length_low: 1000
  drop_length_high: 2000
  drop_count_low: 1
  drop_count_high: 5
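
# Usage sketch (hedged): in the recipe's Brain class these augmentations
# are applied to [batch, time] waveforms during training, e.g.:
#   wavs = speed_perturb(wavs)        # resample-based speed change
#   wavs = drop_freq(wavs)            # zero out random frequency bands
#   wavs = drop_chunk(wavs, lengths)  # zero out random time chunks
# where `lengths` holds relative utterance lengths in [0, 1].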

# loss thresholding -- this thresholds the training loss
threshold_byloss: true
threshold: -30
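
# Loss-thresholding sketch (hedged paraphrase of the recipe's
# compute_objectives): examples already separated beyond 30 dB SI-SNR
# (loss below -30) are dropped from the batch mean so that easy examples
# stop dominating the updates:
#   if threshold_byloss:
#       loss = loss[loss > threshold]
#       if loss.nelement() > 0:
#           loss = loss.mean()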

# Encoder parameters
N_encoder_out: 256
out_channels: 256
kernel_size: 16
kernel_stride: 8
d_ffn: 1024

# Dataloader options
dataloader_opts:
  batch_size: 1
  num_workers: 3


# Specifying the network
Encoder: &id004 !new:speechbrain.lobes.models.dual_path.Encoder
  kernel_size: 16
  out_channels: 256
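
# Shape sketch (hedged): with kernel_size 16 and stride 8, the encoder maps
# a [batch, time] mixture to latents of roughly [batch, 256, time / 8]:
#   mix_w = encoder(mix)  # e.g. [1, 48000] -> [1, 256, ~6000]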


SBtfintra: &id002 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
  num_layers: 8
  d_model: 256
  nhead: 8
  d_ffn: 1024
  dropout: 0
  use_positional_encoding: true
  norm_before: true

SBtfinter: &id003 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
  num_layers: 8
  d_model: 256
  nhead: 8
  d_ffn: 1024
  dropout: 0
  use_positional_encoding: true
  norm_before: true

MaskNet: &id006 !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
  num_spks: 3
  in_channels: 256
  out_channels: 256
  num_layers: 2
  K: 250
  intra_model: *id002
  inter_model: *id003
  norm: ln
  linear_layer_after_inter_intra: false
  skip_around_intra: true

Decoder: &id005 !new:speechbrain.lobes.models.dual_path.Decoder
  in_channels: 256
  out_channels: 1
  kernel_size: 16
  stride: 8
  bias: false
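
# Hedged sketch of how these three blocks compose in the recipe's
# compute_forward (shapes assume num_spks = 3):
#   mix_w = encoder(mix)                         # [B, 256, T']
#   est_mask = masknet(mix_w)                    # [num_spks, B, 256, T']
#   sep_h = torch.stack([mix_w] * 3) * est_mask
#   est_sources = torch.cat(
#       [decoder(sep_h[i]).unsqueeze(-1) for i in range(3)], dim=-1
#   )                                            # [B, T, num_spks]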

optimizer: !name:torch.optim.Adam
  lr: 0.0005
  weight_decay: 0

loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper
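
# The !name: tag resolves to the function itself. Usage sketch (hedged):
#   loss = hparams["loss"](targets, est_sources)
# with both tensors shaped [batch, time, num_spks]; the PIT wrapper picks
# the speaker permutation that minimizes negative SI-SNR.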

lr_scheduler: !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
  factor: 0.5
  patience: 2
  dont_halve_until_epoch: 5
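
# Usage sketch (hedged, matching SpeechBrain's scheduler API): called once
# per epoch with the validation loss, halving the LR after `patience`
# stagnant epochs (but not before epoch 5):
#   current_lr, next_lr = lr_scheduler([optimizer], epoch, valid_loss)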

epoch_counter: &id007 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 50

modules:
  encoder: *id004
  decoder: *id005
  masknet: *id006
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: results/sepformer-libri3mix-48k/1234/save
  recoverables:
    encoder: *id004
    decoder: *id005
    masknet: *id006
    counter: *id007
    # lr_scheduler: !ref <lr_scheduler>
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: results/sepformer-libri3mix-48k/1234/train_log.txt


pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
     encoder: !ref <Encoder>
     masknet: !ref <MaskNet>
     decoder: !ref <Decoder>
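
# Usage sketch (hedged; this block lists loadables but no source paths, so
# the path below is hypothetical):
#   pretrainer.collect_files(default_source="path/to/pretrained")
#   pretrainer.load_collected()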