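# Hyperparameters for a SpeechBrain FastSpeech2WithAlignment model.
# The file uses HyperPyYAML tags: `!ref <key>` reuses another value from
# this file and `!new:<class>` instantiates the named class.
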
# Symbol inventory: 39 phoneme labels followed by ten punctuation marks
# and a single space, 50 entries in total.
lexicon:
  - "AA"
  - "AE"
  - "AH"
  - "AO"
  - "AW"
  - "AY"
  - "B"
  - "CH"
  - "D"
  - "DH"
  - "EH"
  - "ER"
  - "EY"
  - "F"
  - "G"
  - "HH"
  - "IH"
  - "IY"
  - "JH"
  - "K"
  - "L"
  - "M"
  - "N"
  - "NG"
  - "OW"
  - "OY"
  - "P"
  - "R"
  - "S"
  - "SH"
  - "T"
  - "TH"
  - "UH"
  - "UW"
  - "V"
  - "W"
  - "Y"
  - "Z"
  - "ZH"
  - "-"
  - "!"
  - "'"
  - "("
  - ")"
  - ","
  - "."
  - ":"
  - ";"
  - "?"
  - " "

# Vocabulary and feature sizes.
# NOTE(review): the lexicon in this file has 50 entries, so two of the 52
# symbols are presumably reserved (e.g. padding and unknown) - confirm
# against the text encoder before changing either value.
n_symbols: 52
padding_idx: 0
n_mel_channels: 80

# Shared width reused via !ref by the enc_*, dec_* and in_key_channels keys.
hidden_channels: 512

# Encoder settings; model, key and value widths all follow hidden_channels.
enc_num_layers: 4
enc_num_head: 2
enc_d_model: !ref <hidden_channels>
enc_ffn_dim: 1024
enc_k_dim: !ref <hidden_channels>
enc_v_dim: !ref <hidden_channels>
enc_dropout: 0.2

# Alignment settings passed to the model's in_query_channels,
# in_key_channels, attn_channels and temperature arguments.
# NOTE(review): in_query_channels equals n_mel_channels (80) but is a
# separate literal, not a !ref - keep the two in sync if that is intended.
in_query_channels: 80
in_key_channels: !ref <hidden_channels>
attn_channels: 80
temperature: 0.0005

# Decoder settings; model, key and value widths all follow hidden_channels.
dec_num_layers: 4
dec_num_head: 2
dec_d_model: !ref <hidden_channels>
dec_ffn_dim: 1024
dec_k_dim: !ref <hidden_channels>
dec_v_dim: !ref <hidden_channels>
dec_dropout: 0.2

# Postnet settings.
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
postnet_dropout: 0.2

# Options passed once to the model (not split into enc_/dec_ variants).
normalize_before: True
# Plain scalar: `1dcnn` does not match any numeric form, so it loads as a
# string.
ffn_type: 1dcnn
ffn_cnn_kernel_size_list: [9, 1]

# Duration, pitch and energy predictor settings; the dropout value is a
# single key passed to the model as variance_predictor_dropout.
dur_pred_kernel_size: 3
pitch_pred_kernel_size: 3
energy_pred_kernel_size: 3
variance_predictor_dropout: 0.5

# Model instance. Every constructor argument is nested under `model:` and
# taken from the top-level key of the same name, except n_char (from
# n_symbols) and n_mels (from n_mel_channels).
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2WithAlignment
  enc_num_layers: !ref <enc_num_layers>
  enc_num_head: !ref <enc_num_head>
  enc_d_model: !ref <enc_d_model>
  enc_ffn_dim: !ref <enc_ffn_dim>
  enc_k_dim: !ref <enc_k_dim>
  enc_v_dim: !ref <enc_v_dim>
  enc_dropout: !ref <enc_dropout>
  in_query_channels: !ref <in_query_channels>
  in_key_channels: !ref <in_key_channels>
  attn_channels: !ref <attn_channels>
  temperature: !ref <temperature>
  dec_num_layers: !ref <dec_num_layers>
  dec_num_head: !ref <dec_num_head>
  dec_d_model: !ref <dec_d_model>
  dec_ffn_dim: !ref <dec_ffn_dim>
  dec_k_dim: !ref <dec_k_dim>
  dec_v_dim: !ref <dec_v_dim>
  dec_dropout: !ref <dec_dropout>
  normalize_before: !ref <normalize_before>
  ffn_type: !ref <ffn_type>
  ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
  n_char: !ref <n_symbols>
  n_mels: !ref <n_mel_channels>
  postnet_embedding_dim: !ref <postnet_embedding_dim>
  postnet_kernel_size: !ref <postnet_kernel_size>
  postnet_n_convolutions: !ref <postnet_n_convolutions>
  postnet_dropout: !ref <postnet_dropout>
  padding_idx: !ref <padding_idx>
  dur_pred_kernel_size: !ref <dur_pred_kernel_size>
  pitch_pred_kernel_size: !ref <pitch_pred_kernel_size>
  energy_pred_kernel_size: !ref <energy_pred_kernel_size>
  variance_predictor_dropout: !ref <variance_predictor_dropout>

# Text encoder instance, constructed with no arguments.
input_encoder: !new:speechbrain.dataio.encoder.TextEncoder

# Mapping that exposes the model instance under the name `model`.
modules:
  model: !ref <model>

# Pretrainer instance; `loadables` maps the name `model` to the model
# object defined in this file.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    model: !ref <model>