# File size: 3,357 Bytes
# Source revision: 122f8f6
# ################################
# Model: FastSpeech2 with Internal Alignment
# Authors: Yingzhi Wang
# ################################
# Input parameters
# Symbol inventory for the text input: 50 entries in total.
# Keep the order stable; whatever assigns integer indices from this list
# presumably depends on it (TODO confirm against the loading code).
lexicon:
  # ARPAbet-style phoneme labels (no stress digits)
  - 'AA'
  - 'AE'
  - 'AH'
  - 'AO'
  - 'AW'
  - 'AY'
  - 'B'
  - 'CH'
  - 'D'
  - 'DH'
  - 'EH'
  - 'ER'
  - 'EY'
  - 'F'
  - 'G'
  - 'HH'
  - 'IH'
  - 'IY'
  - 'JH'
  - 'K'
  - 'L'
  - 'M'
  - 'N'
  - 'NG'
  - 'OW'
  - 'OY'
  - 'P'
  - 'R'
  - 'S'
  - 'SH'
  - 'T'
  - 'TH'
  - 'UH'
  - 'UW'
  - 'V'
  - 'W'
  - 'Y'
  - 'Z'
  - 'ZH'
  # Punctuation marks, followed by the space character
  - '-'
  - '!'
  - "'"
  - '('
  - ')'
  - ','
  - '.'
  - ':'
  - ';'
  - '?'
  - ' '
# Fixed by the lexicon size: 50 symbols, +1 for a dummy symbol used for
# padding, +1 for unknown.
n_symbols: 52
padding_idx: 0
n_mel_channels: 80
hidden_channels: 512

# Encoder parameters
enc_num_layers: 4
enc_num_head: 2
enc_d_model: !ref <hidden_channels>
enc_ffn_dim: 1024
enc_k_dim: !ref <hidden_channels>
enc_v_dim: !ref <hidden_channels>
enc_dropout: 0.2

# Aligner parameters
in_query_channels: 80
in_key_channels: !ref <hidden_channels>
attn_channels: 80
temperature: 0.0005

# Decoder parameters
dec_num_layers: 4
dec_num_head: 2
dec_d_model: !ref <hidden_channels>
dec_ffn_dim: 1024
dec_k_dim: !ref <hidden_channels>
dec_v_dim: !ref <hidden_channels>
dec_dropout: 0.2

# Postnet parameters
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
postnet_dropout: 0.2

# Common (shared by encoder and decoder)
normalize_before: true
# Accepted values: 1dcnn or ffn
ffn_type: '1dcnn'
ffn_cnn_kernel_size_list: [9, 1]

# Variance predictor
dur_pred_kernel_size: 3
pitch_pred_kernel_size: 3
energy_pred_kernel_size: 3
variance_predictor_dropout: 0.5
# Model
# FastSpeech2WithAlignment instance built by HyperPyYAML's `!new:` tag.
# Every keyword below must be nested under `model:` so that it is passed to
# the constructor; at column 0 these lines would instead be parsed as
# duplicate, self-referential top-level keys and the constructor would
# receive no arguments.
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2WithAlignment
  # Encoder
  enc_num_layers: !ref <enc_num_layers>
  enc_num_head: !ref <enc_num_head>
  enc_d_model: !ref <enc_d_model>
  enc_ffn_dim: !ref <enc_ffn_dim>
  enc_k_dim: !ref <enc_k_dim>
  enc_v_dim: !ref <enc_v_dim>
  enc_dropout: !ref <enc_dropout>
  # Aligner
  in_query_channels: !ref <in_query_channels>
  in_key_channels: !ref <in_key_channels>
  attn_channels: !ref <attn_channels>
  temperature: !ref <temperature>
  # Decoder
  dec_num_layers: !ref <dec_num_layers>
  dec_num_head: !ref <dec_num_head>
  dec_d_model: !ref <dec_d_model>
  dec_ffn_dim: !ref <dec_ffn_dim>
  dec_k_dim: !ref <dec_k_dim>
  dec_v_dim: !ref <dec_v_dim>
  dec_dropout: !ref <dec_dropout>
  # Common transformer settings
  normalize_before: !ref <normalize_before>
  ffn_type: !ref <ffn_type>
  ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
  # Input vocabulary size and output mel dimension
  n_char: !ref <n_symbols>
  n_mels: !ref <n_mel_channels>
  # Postnet
  postnet_embedding_dim: !ref <postnet_embedding_dim>
  postnet_kernel_size: !ref <postnet_kernel_size>
  postnet_n_convolutions: !ref <postnet_n_convolutions>
  postnet_dropout: !ref <postnet_dropout>
  padding_idx: !ref <padding_idx>
  # Variance predictors
  dur_pred_kernel_size: !ref <dur_pred_kernel_size>
  pitch_pred_kernel_size: !ref <pitch_pred_kernel_size>
  energy_pred_kernel_size: !ref <energy_pred_kernel_size>
  variance_predictor_dropout: !ref <variance_predictor_dropout>
# TextEncoder instantiated with no constructor arguments; nothing in this
# file populates it, so presumably the loading code registers the `lexicon`
# symbols on it (TODO confirm against the inference interface).
input_encoder: !new:speechbrain.dataio.encoder.TextEncoder
# Named modules handed to the loading code. The entry is nested so that
# `modules.model` references the object built under the top-level `model`
# key instead of redefining that key with a reference to itself.
modules:
  model: !ref <model>
# Pretrainer instance. `loadables` maps the name `model` to the model object
# (presumably the target that saved parameters are loaded into; confirm
# against the Pretrainer documentation). Both levels are nested so that
# `loadables` is a constructor argument rather than a top-level key.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    model: !ref <model>