# ################################
# Model: Fastspeech2 Internal Alignment
# Authors: Yingzhi Wang
# ################################

# Input parameters
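# The lexicon lists every symbol the text front-end can emit: the 39
# stress-free ARPAbet phonemes followed by punctuation and whitespace.
# The TextEncoder defined at the bottom of this file maps each entry to
# an integer index.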
lexicon:
    - "AA"
    - "AE"
    - "AH"
    - "AO"
    - "AW"
    - "AY"
    - "B"
    - "CH"
    - "D"
    - "DH"
    - "EH"
    - "ER"
    - "EY"
    - "F"
    - "G"
    - "HH"
    - "IH"
    - "IY"
    - "JH"
    - "K"
    - "L"
    - "M"
    - "N"
    - "NG"
    - "OW"
    - "OY"
    - "P"
    - "R"
    - "S"
    - "SH"
    - "T"
    - "TH"
    - "UH"
    - "UW"
    - "V"
    - "W"
    - "Y"
    - "Z"
    - "ZH"
    - "-"
    - "!"
    - "'"
    - "("
    - ")"
    - ","
    - "."
    - ":"
    - ";"
    - "?"
    - " "

n_symbols: 52 # 50 lexicon entries + 1 dummy padding symbol + 1 unknown symbol
padding_idx: 0
n_mel_channels: 80

hidden_channels: 512

# Encoder parameters
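# Phoneme-side Transformer encoder; the decoder below uses a mirrored
# configuration on the mel side.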
enc_num_layers: 4
enc_num_head: 2
enc_d_model: !ref <hidden_channels>
enc_ffn_dim: 1024
enc_k_dim: !ref <hidden_channels>
enc_v_dim: !ref <hidden_channels>
enc_dropout: 0.2

# Aligner parameters
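# The aligner learns the text-to-mel alignment during training (the
# "internal alignment" in the header), so no externally extracted
# durations are needed. Queries are mel frames (hence 80 input channels)
# and keys are encoder states. temperature is assumed to scale the
# aligner's attention softmax, with smaller values giving sharper
# alignments.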
in_query_channels: 80
in_key_channels: !ref <hidden_channels>
attn_channels: 80
temperature: 0.0005

# Decoder parameters
dec_num_layers: 4
dec_num_head: 2
dec_d_model: !ref <hidden_channels>
dec_ffn_dim: 1024
dec_k_dim: !ref <hidden_channels>
dec_v_dim: !ref <hidden_channels>
dec_dropout: 0.2

# Postnet parameters
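# Convolutional postnet that refines the decoder's mel output with a
# residual correction, following the Tacotron 2 design.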
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
postnet_dropout: 0.2

# Common
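# normalize_before: True selects pre-norm Transformer blocks (layer norm
# before attention/FFN), which tends to train more stably than post-norm.
# With ffn_type: 1dcnn, the feed-forward sublayer is a pair of 1-D
# convolutions with the kernel sizes listed below, as in the FastSpeech
# papers.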
normalize_before: True
ffn_type: 1dcnn # 1dcnn or ffn
ffn_cnn_kernel_size_list: [9, 1]

# Variance predictor
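# FastSpeech2 predicts duration, pitch, and energy from the encoder output
# and conditions the decoder on them; the kernel sizes below configure the
# 1-D convolutions inside each predictor.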
dur_pred_kernel_size: 3
pitch_pred_kernel_size: 3
energy_pred_kernel_size: 3
variance_predictor_dropout: 0.5

# Model
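# HyperPyYAML instantiates the class at load time via !new:, and each !ref
# resolves to a value defined above, keeping all hyperparameters in one
# place.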
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2WithAlignment
    enc_num_layers: !ref <enc_num_layers>
    enc_num_head: !ref <enc_num_head>
    enc_d_model: !ref <enc_d_model>
    enc_ffn_dim: !ref <enc_ffn_dim>
    enc_k_dim: !ref <enc_k_dim>
    enc_v_dim: !ref <enc_v_dim>
    enc_dropout: !ref <enc_dropout>
    in_query_channels: !ref <in_query_channels>
    in_key_channels: !ref <in_key_channels>
    attn_channels: !ref <attn_channels>
    temperature: !ref <temperature>
    dec_num_layers: !ref <dec_num_layers>
    dec_num_head: !ref <dec_num_head>
    dec_d_model: !ref <dec_d_model>
    dec_ffn_dim: !ref <dec_ffn_dim>
    dec_k_dim: !ref <dec_k_dim>
    dec_v_dim: !ref <dec_v_dim>
    dec_dropout: !ref <dec_dropout>
    normalize_before: !ref <normalize_before>
    ffn_type: !ref <ffn_type>
    ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
    n_char: !ref <n_symbols>
    n_mels: !ref <n_mel_channels>
    postnet_embedding_dim: !ref <postnet_embedding_dim>
    postnet_kernel_size: !ref <postnet_kernel_size>
    postnet_n_convolutions: !ref <postnet_n_convolutions>
    postnet_dropout: !ref <postnet_dropout>
    padding_idx: !ref <padding_idx>
    dur_pred_kernel_size: !ref <dur_pred_kernel_size>
    pitch_pred_kernel_size: !ref <pitch_pred_kernel_size>
    energy_pred_kernel_size: !ref <energy_pred_kernel_size>
    variance_predictor_dropout: !ref <variance_predictor_dropout>

input_encoder: !new:speechbrain.dataio.encoder.TextEncoder
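# The TextEncoder starts empty; the accompanying script is expected to
# populate it from the lexicon above (e.g. with update_from_iterable) and
# to add the unknown and padding symbols counted in n_symbols.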

modules:
    model: !ref <model>

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
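
# Minimal usage sketch (not part of the original file; the file path and
# the Pretrainer calls are assumptions based on standard SpeechBrain and
# HyperPyYAML usage):
#
#   from hyperpyyaml import load_hyperpyyaml
#   with open("hyperparams.yaml") as f:
#       hparams = load_hyperpyyaml(f)
#   hparams["pretrainer"].collect_files(default_source="<model_dir>")
#   hparams["pretrainer"].load_collected()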