Siddhant committed on
Commit
0c625e7
1 Parent(s): 46957e3

import from zenodo

Browse files
README.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: ja
7
+ datasets:
8
+ - jsut
9
+ license: cc-by-4.0
10
+ ---
11
+ ## Example ESPnet2 TTS model
12
+ ### `kan-bayashi/jsut_transformer_accent`
13
+ ♻️ Imported from https://zenodo.org/record/4381096/
14
+
15
+ This model was trained by kan-bayashi using the jsut/tts1 recipe in [espnet](https://github.com/espnet/espnet/).
16
+ ### Demo: How to use in ESPnet2
17
+ ```python
18
+ # coming soon
19
+ ```
20
+ ### Citing ESPnet
21
+ ```bibtex
22
+ @inproceedings{watanabe2018espnet,
23
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson {Enrique Yalta Soplin} and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
24
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
25
+ year={2018},
26
+ booktitle={Proceedings of Interspeech},
27
+ pages={2207--2211},
28
+ doi={10.21437/Interspeech.2018-1456},
29
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
30
+ }
31
+ @inproceedings{hayashi2020espnet,
32
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
33
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
34
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
35
+ pages={7654--7658},
36
+ year={2020},
37
+ organization={IEEE}
38
+ }
39
+ ```
40
+ or arXiv:
41
+ ```bibtex
42
+ @misc{watanabe2018espnet,
43
+ title={ESPnet: End-to-End Speech Processing Toolkit},
44
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Enrique Yalta Soplin and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
45
+ year={2018},
46
+ eprint={1804.00015},
47
+ archivePrefix={arXiv},
48
+ primaryClass={cs.CL}
49
+ }
50
+ ```
exp/tts_stats_raw_phn_jaconv_pyopenjtalk_accent/train/feats_stats.npz ADDED
Binary file (1.4 kB). View file
 
exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent/config.yaml ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_transformer.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 33745
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ cudnn_enabled: true
21
+ cudnn_benchmark: false
22
+ cudnn_deterministic: true
23
+ collect_stats: false
24
+ write_collected_feats: false
25
+ max_epoch: 200
26
+ patience: null
27
+ val_scheduler_criterion:
28
+ - valid
29
+ - loss
30
+ early_stopping_criterion:
31
+ - valid
32
+ - loss
33
+ - min
34
+ best_model_criterion:
35
+ - - valid
36
+ - loss
37
+ - min
38
+ - - train
39
+ - loss
40
+ - min
41
+ keep_nbest_models: 5
42
+ grad_clip: 1.0
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 2
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: null
51
+ unused_parameters: false
52
+ use_tensorboard: true
53
+ use_wandb: false
54
+ wandb_project: null
55
+ wandb_id: null
56
+ pretrain_path: null
57
+ init_param: []
58
+ num_iters_per_epoch: 1000
59
+ batch_size: 20
60
+ valid_batch_size: null
61
+ batch_bins: 9000000
62
+ valid_batch_bins: null
63
+ train_shape_file:
64
+ - exp/tts_stats_raw_phn_jaconv_pyopenjtalk_accent/train/text_shape.phn
65
+ - exp/tts_stats_raw_phn_jaconv_pyopenjtalk_accent/train/speech_shape
66
+ valid_shape_file:
67
+ - exp/tts_stats_raw_phn_jaconv_pyopenjtalk_accent/valid/text_shape.phn
68
+ - exp/tts_stats_raw_phn_jaconv_pyopenjtalk_accent/valid/speech_shape
69
+ batch_type: numel
70
+ valid_batch_type: null
71
+ fold_length:
72
+ - 150
73
+ - 240000
74
+ sort_in_batch: descending
75
+ sort_batch: descending
76
+ multiple_iterator: false
77
+ chunk_length: 500
78
+ chunk_shift_ratio: 0.5
79
+ num_cache_chunks: 1024
80
+ train_data_path_and_name_and_type:
81
+ - - dump/raw/tr_no_dev/text
82
+ - text
83
+ - text
84
+ - - dump/raw/tr_no_dev/wav.scp
85
+ - speech
86
+ - sound
87
+ valid_data_path_and_name_and_type:
88
+ - - dump/raw/dev/text
89
+ - text
90
+ - text
91
+ - - dump/raw/dev/wav.scp
92
+ - speech
93
+ - sound
94
+ allow_variable_data_keys: false
95
+ max_cache_size: 0.0
96
+ max_cache_fd: 32
97
+ valid_max_cache_size: null
98
+ optim: adam
99
+ optim_conf:
100
+ lr: 1.0
101
+ scheduler: noamlr
102
+ scheduler_conf:
103
+ model_size: 512
104
+ warmup_steps: 8000
105
+ token_list:
106
+ - <blank>
107
+ - <unk>
108
+ - '1'
109
+ - '2'
110
+ - '0'
111
+ - '3'
112
+ - '4'
113
+ - '-1'
114
+ - '5'
115
+ - a
116
+ - o
117
+ - '-2'
118
+ - i
119
+ - '-3'
120
+ - u
121
+ - e
122
+ - k
123
+ - n
124
+ - t
125
+ - '6'
126
+ - r
127
+ - '-4'
128
+ - s
129
+ - N
130
+ - m
131
+ - '7'
132
+ - sh
133
+ - d
134
+ - g
135
+ - w
136
+ - '8'
137
+ - U
138
+ - '-5'
139
+ - I
140
+ - cl
141
+ - h
142
+ - y
143
+ - b
144
+ - '9'
145
+ - j
146
+ - ts
147
+ - ch
148
+ - '-6'
149
+ - z
150
+ - p
151
+ - '-7'
152
+ - f
153
+ - ky
154
+ - ry
155
+ - '-8'
156
+ - gy
157
+ - '-9'
158
+ - hy
159
+ - ny
160
+ - '-10'
161
+ - by
162
+ - my
163
+ - '-11'
164
+ - '-12'
165
+ - '-13'
166
+ - py
167
+ - '-14'
168
+ - '-15'
169
+ - v
170
+ - '10'
171
+ - '-16'
172
+ - '-17'
173
+ - '11'
174
+ - '-21'
175
+ - '-20'
176
+ - '12'
177
+ - '-19'
178
+ - '13'
179
+ - '-18'
180
+ - '14'
181
+ - dy
182
+ - '15'
183
+ - ty
184
+ - '-22'
185
+ - '16'
186
+ - '18'
187
+ - '19'
188
+ - '17'
189
+ - <sos/eos>
190
+ odim: null
191
+ model_conf: {}
192
+ use_preprocessor: true
193
+ token_type: phn
194
+ bpemodel: null
195
+ non_linguistic_symbols: null
196
+ cleaner: jaconv
197
+ g2p: pyopenjtalk_accent
198
+ feats_extract: fbank
199
+ feats_extract_conf:
200
+ fs: 24000
201
+ fmin: 80
202
+ fmax: 7600
203
+ n_mels: 80
204
+ hop_length: 300
205
+ n_fft: 2048
206
+ win_length: 1200
207
+ normalize: global_mvn
208
+ normalize_conf:
209
+ stats_file: exp/tts_stats_raw_phn_jaconv_pyopenjtalk_accent/train/feats_stats.npz
210
+ tts: transformer
211
+ tts_conf:
212
+ embed_dim: 0
213
+ eprenet_conv_layers: 0
214
+ eprenet_conv_filts: 0
215
+ eprenet_conv_chans: 0
216
+ dprenet_layers: 2
217
+ dprenet_units: 256
218
+ adim: 512
219
+ aheads: 8
220
+ elayers: 6
221
+ eunits: 1024
222
+ dlayers: 6
223
+ dunits: 1024
224
+ positionwise_layer_type: conv1d
225
+ positionwise_conv_kernel_size: 1
226
+ postnet_layers: 5
227
+ postnet_filts: 5
228
+ postnet_chans: 256
229
+ use_masking: true
230
+ bce_pos_weight: 5.0
231
+ use_scaled_pos_enc: true
232
+ encoder_normalize_before: true
233
+ decoder_normalize_before: true
234
+ reduction_factor: 1
235
+ init_type: xavier_uniform
236
+ init_enc_alpha: 1.0
237
+ init_dec_alpha: 1.0
238
+ eprenet_dropout_rate: 0.0
239
+ dprenet_dropout_rate: 0.5
240
+ postnet_dropout_rate: 0.5
241
+ transformer_enc_dropout_rate: 0.1
242
+ transformer_enc_positional_dropout_rate: 0.1
243
+ transformer_enc_attn_dropout_rate: 0.1
244
+ transformer_dec_dropout_rate: 0.1
245
+ transformer_dec_positional_dropout_rate: 0.1
246
+ transformer_dec_attn_dropout_rate: 0.1
247
+ transformer_enc_dec_attn_dropout_rate: 0.1
248
+ use_guided_attn_loss: true
249
+ num_heads_applied_guided_attn: 2
250
+ num_layers_applied_guided_attn: 2
251
+ modules_applied_guided_attn:
252
+ - encoder-decoder
253
+ guided_attn_loss_sigma: 0.4
254
+ guided_attn_loss_lambda: 10.0
255
+ pitch_extract: null
256
+ pitch_extract_conf: {}
257
+ pitch_normalize: null
258
+ pitch_normalize_conf: {}
259
+ energy_extract: null
260
+ energy_extract_conf: {}
261
+ energy_normalize: null
262
+ energy_normalize_conf: {}
263
+ required:
264
+ - output_dir
265
+ - token_list
266
+ distributed: true
exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent/images/backward_time.png ADDED
exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent/images/bce_loss.png ADDED
exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent/images/decoder_alpha.png ADDED
exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent/images/enc_dec_attn_loss.png ADDED
exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent/images/encoder_alpha.png ADDED
exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent/images/forward_time.png ADDED
exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent/images/iter_time.png ADDED
exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent/images/l1_loss.png ADDED
exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent/images/l2_loss.png ADDED
exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent/images/loss.png ADDED
exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent/images/lr_0.png ADDED
exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent/images/optim_step_time.png ADDED
exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent/images/train_time.png ADDED
exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent/train.loss.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59cee4de7466bbd2594e70169161d9bd75068f938555600296251036fa91fce2
3
+ size 132556553
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: 0.8.0
2
+ files:
3
+ model_file: exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent/train.loss.ave_5best.pth
4
+ python: "3.7.3 (default, Mar 27 2019, 22:11:17) \n[GCC 7.3.0]"
5
+ timestamp: 1608519822.145348
6
+ torch: 1.5.1
7
+ yaml_files:
8
+ train_config: exp/tts_train_transformer_raw_phn_jaconv_pyopenjtalk_accent/config.yaml