Audio-to-Audio
ESPnet
audio
Wangyou Zhang committed on
Commit
c9fa345
1 Parent(s): 2a572f9

initial commit

Browse files
README.md CHANGED
@@ -1,3 +1,253 @@
1
  ---
 
 
 
 
 
 
 
2
  license: cc-by-4.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - audio-to-audio
6
+ language:
7
+ datasets:
8
+ - wsj0-2mix
9
  license: cc-by-4.0
10
  ---
11
+
12
+ ## ESPnet2 ENH model
13
+
14
+ ### `espnet/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw`
15
+
16
+ This model was trained by Wangyou Zhang using wsj0_2mix recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ ```bash
21
+ cd espnet
22
+
23
+ pip install -e .
24
+ cd egs2/wsj0_2mix/enh1
25
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw
26
+ ```
27
+
28
+
29
+
30
+ ## ENH config
31
+
32
+ <details><summary>expand</summary>
33
+
34
+ ```
35
+ config: conf/tuning/train_enh_dptnet.yaml
36
+ print_config: false
37
+ log_level: INFO
38
+ dry_run: false
39
+ iterator_type: chunk
40
+ output_dir: exp/enh_train_enh_dptnet_raw
41
+ ngpu: 1
42
+ seed: 0
43
+ num_workers: 4
44
+ num_att_plot: 3
45
+ dist_backend: nccl
46
+ dist_init_method: env://
47
+ dist_world_size: 4
48
+ dist_rank: 0
49
+ local_rank: 0
50
+ dist_master_addr: localhost
51
+ dist_master_port: 53094
52
+ dist_launcher: null
53
+ multiprocessing_distributed: true
54
+ unused_parameters: true
55
+ sharded_ddp: false
56
+ cudnn_enabled: true
57
+ cudnn_benchmark: false
58
+ cudnn_deterministic: true
59
+ collect_stats: false
60
+ write_collected_feats: false
61
+ validate_train_iter: false
62
+ max_epoch: 150
63
+ patience: 10
64
+ val_scheduler_criterion:
65
+ - valid
66
+ - loss
67
+ early_stopping_criterion:
68
+ - valid
69
+ - loss
70
+ - min
71
+ best_model_criterion:
72
+ - - valid
73
+ - si_snr
74
+ - max
75
+ - - valid
76
+ - loss
77
+ - min
78
+ keep_nbest_models: 1
79
+ nbest_averaging_interval: 0
80
+ grad_clip: 5
81
+ grad_clip_type: 2.0
82
+ grad_noise: false
83
+ accum_grad: 1
84
+ no_forward_run: false
85
+ resume: true
86
+ train_dtype: float32
87
+ use_amp: false
88
+ log_interval: null
89
+ use_matplotlib: true
90
+ use_tensorboard: true
91
+ use_wandb: false
92
+ wandb_project: null
93
+ wandb_id: null
94
+ wandb_entity: null
95
+ wandb_name: null
96
+ wandb_model_log_interval: -1
97
+ detect_anomaly: false
98
+ pretrain_path: null
99
+ init_param: []
100
+ ignore_init_mismatch: false
101
+ freeze_param: []
102
+ num_iters_per_epoch: null
103
+ batch_size: 4
104
+ valid_batch_size: null
105
+ batch_bins: 1000000
106
+ valid_batch_bins: null
107
+ train_shape_file:
108
+ - exp/enh_stats_8k/train/speech_mix_shape
109
+ - exp/enh_stats_8k/train/speech_ref1_shape
110
+ - exp/enh_stats_8k/train/speech_ref2_shape
111
+ valid_shape_file:
112
+ - exp/enh_stats_8k/valid/speech_mix_shape
113
+ - exp/enh_stats_8k/valid/speech_ref1_shape
114
+ - exp/enh_stats_8k/valid/speech_ref2_shape
115
+ batch_type: folded
116
+ valid_batch_type: null
117
+ fold_length:
118
+ - 80000
119
+ - 80000
120
+ - 80000
121
+ sort_in_batch: descending
122
+ sort_batch: descending
123
+ multiple_iterator: false
124
+ chunk_length: 20000
125
+ chunk_shift_ratio: 0.5
126
+ num_cache_chunks: 1024
127
+ train_data_path_and_name_and_type:
128
+ - - dump/raw/tr_min_8k/wav.scp
129
+ - speech_mix
130
+ - sound
131
+ - - dump/raw/tr_min_8k/spk1.scp
132
+ - speech_ref1
133
+ - sound
134
+ - - dump/raw/tr_min_8k/spk2.scp
135
+ - speech_ref2
136
+ - sound
137
+ valid_data_path_and_name_and_type:
138
+ - - dump/raw/cv_min_8k/wav.scp
139
+ - speech_mix
140
+ - sound
141
+ - - dump/raw/cv_min_8k/spk1.scp
142
+ - speech_ref1
143
+ - sound
144
+ - - dump/raw/cv_min_8k/spk2.scp
145
+ - speech_ref2
146
+ - sound
147
+ allow_variable_data_keys: false
148
+ max_cache_size: 0.0
149
+ max_cache_fd: 32
150
+ valid_max_cache_size: null
151
+ optim: adam
152
+ optim_conf:
153
+ lr: 0.0004
154
+ eps: 1.0e-08
155
+ weight_decay: 1.0e-05
156
+ scheduler: warmupsteplr
157
+ scheduler_conf:
158
+ warmup_steps: 4000
159
+ steps_per_epoch: 14273
160
+ step_size: 2
161
+ gamma: 0.98
162
+ init: null
163
+ model_conf:
164
+ stft_consistency: false
165
+ loss_type: mask_mse
166
+ mask_type: null
167
+ criterions:
168
+ - name: si_snr
169
+ conf:
170
+ eps: 1.0e-07
171
+ wrapper: pit
172
+ wrapper_conf:
173
+ weight: 1.0
174
+ independent_perm: true
175
+ use_preprocessor: false
176
+ encoder: conv
177
+ encoder_conf:
178
+ channel: 64
179
+ kernel_size: 2
180
+ stride: 1
181
+ separator: dptnet
182
+ separator_conf:
183
+ num_spk: 2
184
+ post_enc_relu: true
185
+ layer: 6
186
+ rnn_type: lstm
187
+ bidirectional: true
188
+ unit: 128
189
+ att_heads: 4
190
+ dropout: 0.0
191
+ activation: relu
192
+ norm_type: gLN
193
+ segment_size: 250
194
+ nonlinear: relu
195
+ decoder: conv
196
+ decoder_conf:
197
+ channel: 64
198
+ kernel_size: 2
199
+ stride: 1
200
+ required:
201
+ - output_dir
202
+ version: 0.10.7a1
203
+ distributed: true
204
+ ```
205
+
206
+ </details>
207
+
208
+
209
+
210
+ ### Citing ESPnet
211
+
212
+ ```bibtex
213
+ @inproceedings{watanabe2018espnet,
214
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
215
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
216
+ year={2018},
217
+ booktitle={Proceedings of Interspeech},
218
+ pages={2207--2211},
219
+ doi={10.21437/Interspeech.2018-1456},
220
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
221
+ }
222
+
223
+ @inproceedings{li2021espnetse,
224
+ title={{ESPnet-SE}: End-to-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
225
+ author={Li, Chenda and Shi, Jing and Zhang, Wangyou and Subramanian, Aswin Shanmugam and Chang, Xuankai and Kamo, Naoyuki and Hira, Moto and Hayashi, Tomoki and Boeddeker, Christoph and Chen, Zhuo and Watanabe, Shinji},
226
+ booktitle={Proc. IEEE Spoken Language Technology Workshop (SLT)},
227
+ pages={785--792},
228
+ year={2021},
229
+ }
230
+
231
+ ```
232
+
233
+ or arXiv:
234
+
235
+ ```bibtex
236
+ @misc{watanabe2018espnet,
237
+ title={ESPnet: End-to-End Speech Processing Toolkit},
238
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
239
+ year={2018},
240
+ eprint={1804.00015},
241
+ archivePrefix={arXiv},
242
+ primaryClass={cs.CL}
243
+ }
244
+
245
+ @inproceedings{li2021espnetse,
246
+ title={{ESPnet-SE}: End-to-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
247
+ author={Li, Chenda and Shi, Jing and Zhang, Wangyou and Subramanian, Aswin Shanmugam and Chang, Xuankai and Kamo, Naoyuki and Hira, Moto and Hayashi, Tomoki and Boeddeker, Christoph and Chen, Zhuo and Watanabe, Shinji},
248
+ year={2020},
249
+ eprint={2011.03706},
250
+ archivePrefix={arXiv},
251
+ primaryClass={eess.AS}
252
+ }
253
+ ```
exp/enh_stats_8k/train/feats_stats.npz ADDED
Binary file (778 Bytes). View file
 
exp/enh_train_enh_dptnet_raw/99epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34bbfa87de88766844af4c3d313e34ef99e15194e6f394df354ca5fb6564bb0c
3
+ size 11274659
exp/enh_train_enh_dptnet_raw/RESULTS.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by ./scripts/utils/show_enh_score.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Tue Jun 21 20:50:00 CST 2022`
5
+ - python version: `3.8.12 (default, Oct 12 2021, 13:49:34) [GCC 7.5.0]`
6
+ - espnet version: `espnet 0.10.7a1`
7
+ - pytorch version: `pytorch 1.10.2+cu102`
8
+ - Git hash: `9c24b3adddbde3402530080cb58ae08a6f4dd642`
9
+ - Commit date: `Wed Feb 23 14:49:15 2022 -0500`
10
+
11
+
12
+ ## enh_train_enh_dptnet_orig_raw
13
+
14
+ config: conf/tuning/train_enh_dptnet.yaml
15
+
16
+ |dataset|STOI|SAR|SDR|SIR|SI_SNR|
17
+ |---|---|---|---|---|---|
18
+ |enhanced_cv_min_8k|97.43|21.39|20.98|32.17|20.63|
19
+ |enhanced_tt_min_8k|98.18|21.47|21.06|32.48|20.72|
20
+
exp/enh_train_enh_dptnet_raw/config.yaml ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_enh_dptnet.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: chunk
6
+ output_dir: exp/enh_train_enh_dptnet_raw
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 53094
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ validate_train_iter: false
28
+ max_epoch: 150
29
+ patience: 10
30
+ val_scheduler_criterion:
31
+ - valid
32
+ - loss
33
+ early_stopping_criterion:
34
+ - valid
35
+ - loss
36
+ - min
37
+ best_model_criterion:
38
+ - - valid
39
+ - si_snr
40
+ - max
41
+ - - valid
42
+ - loss
43
+ - min
44
+ keep_nbest_models: 1
45
+ nbest_averaging_interval: 0
46
+ grad_clip: 5
47
+ grad_clip_type: 2.0
48
+ grad_noise: false
49
+ accum_grad: 1
50
+ no_forward_run: false
51
+ resume: true
52
+ train_dtype: float32
53
+ use_amp: false
54
+ log_interval: null
55
+ use_matplotlib: true
56
+ use_tensorboard: true
57
+ use_wandb: false
58
+ wandb_project: null
59
+ wandb_id: null
60
+ wandb_entity: null
61
+ wandb_name: null
62
+ wandb_model_log_interval: -1
63
+ detect_anomaly: false
64
+ pretrain_path: null
65
+ init_param: []
66
+ ignore_init_mismatch: false
67
+ freeze_param: []
68
+ num_iters_per_epoch: null
69
+ batch_size: 4
70
+ valid_batch_size: null
71
+ batch_bins: 1000000
72
+ valid_batch_bins: null
73
+ train_shape_file:
74
+ - exp/enh_stats_8k/train/speech_mix_shape
75
+ - exp/enh_stats_8k/train/speech_ref1_shape
76
+ - exp/enh_stats_8k/train/speech_ref2_shape
77
+ valid_shape_file:
78
+ - exp/enh_stats_8k/valid/speech_mix_shape
79
+ - exp/enh_stats_8k/valid/speech_ref1_shape
80
+ - exp/enh_stats_8k/valid/speech_ref2_shape
81
+ batch_type: folded
82
+ valid_batch_type: null
83
+ fold_length:
84
+ - 80000
85
+ - 80000
86
+ - 80000
87
+ sort_in_batch: descending
88
+ sort_batch: descending
89
+ multiple_iterator: false
90
+ chunk_length: 20000
91
+ chunk_shift_ratio: 0.5
92
+ num_cache_chunks: 1024
93
+ train_data_path_and_name_and_type:
94
+ - - dump/raw/tr_min_8k/wav.scp
95
+ - speech_mix
96
+ - sound
97
+ - - dump/raw/tr_min_8k/spk1.scp
98
+ - speech_ref1
99
+ - sound
100
+ - - dump/raw/tr_min_8k/spk2.scp
101
+ - speech_ref2
102
+ - sound
103
+ valid_data_path_and_name_and_type:
104
+ - - dump/raw/cv_min_8k/wav.scp
105
+ - speech_mix
106
+ - sound
107
+ - - dump/raw/cv_min_8k/spk1.scp
108
+ - speech_ref1
109
+ - sound
110
+ - - dump/raw/cv_min_8k/spk2.scp
111
+ - speech_ref2
112
+ - sound
113
+ allow_variable_data_keys: false
114
+ max_cache_size: 0.0
115
+ max_cache_fd: 32
116
+ valid_max_cache_size: null
117
+ optim: adam
118
+ optim_conf:
119
+ lr: 0.0004
120
+ eps: 1.0e-08
121
+ weight_decay: 1.0e-05
122
+ scheduler: warmupsteplr
123
+ scheduler_conf:
124
+ warmup_steps: 4000
125
+ steps_per_epoch: 14273
126
+ step_size: 2
127
+ gamma: 0.98
128
+ init: null
129
+ model_conf:
130
+ stft_consistency: false
131
+ loss_type: mask_mse
132
+ mask_type: null
133
+ criterions:
134
+ - name: si_snr
135
+ conf:
136
+ eps: 1.0e-07
137
+ wrapper: pit
138
+ wrapper_conf:
139
+ weight: 1.0
140
+ independent_perm: true
141
+ use_preprocessor: false
142
+ encoder: conv
143
+ encoder_conf:
144
+ channel: 64
145
+ kernel_size: 2
146
+ stride: 1
147
+ separator: dptnet
148
+ separator_conf:
149
+ num_spk: 2
150
+ post_enc_relu: true
151
+ layer: 6
152
+ rnn_type: lstm
153
+ bidirectional: true
154
+ unit: 128
155
+ att_heads: 4
156
+ dropout: 0.0
157
+ activation: relu
158
+ norm_type: gLN
159
+ segment_size: 250
160
+ nonlinear: relu
161
+ decoder: conv
162
+ decoder_conf:
163
+ channel: 64
164
+ kernel_size: 2
165
+ stride: 1
166
+ required:
167
+ - output_dir
168
+ version: 0.10.7a1
169
+ distributed: true
exp/enh_train_enh_dptnet_raw/images/backward_time.png ADDED
exp/enh_train_enh_dptnet_raw/images/forward_time.png ADDED
exp/enh_train_enh_dptnet_raw/images/gpu_max_cached_mem_GB.png ADDED
exp/enh_train_enh_dptnet_raw/images/iter_time.png ADDED
exp/enh_train_enh_dptnet_raw/images/loss.png ADDED
exp/enh_train_enh_dptnet_raw/images/optim0_lr0.png ADDED
exp/enh_train_enh_dptnet_raw/images/optim_step_time.png ADDED
exp/enh_train_enh_dptnet_raw/images/si_snr_loss.png ADDED
exp/enh_train_enh_dptnet_raw/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: 0.10.7a1
2
+ files:
3
+ model_file: exp/enh_train_enh_dptnet_raw/99epoch.pth
4
+ python: "3.8.12 (default, Oct 12 2021, 13:49:34) \n[GCC 7.5.0]"
5
+ timestamp: 1655818843.663898
6
+ torch: 1.10.2+cu102
7
+ yaml_files:
8
+ train_config: exp/enh_train_enh_dptnet_raw/config.yaml