# ################################
# Model: Tacotron2 for TTS
# Authors: Artem Ploujnikov, Yingzhi Wang
# ################################

mask_padding: True
n_mel_channels: 80
n_symbols: 148
symbols_embedding_dim: 512
encoder_kernel_size: 5
encoder_n_convolutions: 3
encoder_embedding_dim: 512
attention_rnn_dim: 1024
attention_dim: 128
attention_location_n_filters: 32
attention_location_kernel_size: 31
n_frames_per_step: 1
decoder_rnn_dim: 1024
prenet_dim: 256
max_decoder_steps: 1000
gate_threshold: 0.5
p_attention_dropout: 0.1
p_decoder_dropout: 0.1
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
decoder_no_early_stopping: False
sample_rate: 22050
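
# These top-level values are referenced below via !ref, so they can be
# overridden at load time. A minimal sketch with hyperpyyaml (the file
# name here is an assumption):
#
#   from hyperpyyaml import load_hyperpyyaml
#   with open("hyperparams.yaml") as f:
#       hparams = load_hyperpyyaml(f, overrides={"max_decoder_steps": 2000})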

# Model
model: !new:speechbrain.lobes.models.Tacotron2.Tacotron2
    mask_padding: !ref <mask_padding>
    n_mel_channels: !ref <n_mel_channels>
    # symbols
    n_symbols: !ref <n_symbols>
    symbols_embedding_dim: !ref <symbols_embedding_dim>
    # encoder
    encoder_kernel_size: !ref <encoder_kernel_size>
    encoder_n_convolutions: !ref <encoder_n_convolutions>
    encoder_embedding_dim: !ref <encoder_embedding_dim>
    # attention
    attention_rnn_dim: !ref <attention_rnn_dim>
    attention_dim: !ref <attention_dim>
    # attention location
    attention_location_n_filters: !ref <attention_location_n_filters>
    attention_location_kernel_size: !ref <attention_location_kernel_size>
    # decoder
    n_frames_per_step: !ref <n_frames_per_step>
    decoder_rnn_dim: !ref <decoder_rnn_dim>
    prenet_dim: !ref <prenet_dim>
    max_decoder_steps: !ref <max_decoder_steps>
    gate_threshold: !ref <gate_threshold>
    p_attention_dropout: !ref <p_attention_dropout>
    p_decoder_dropout: !ref <p_decoder_dropout>
    # postnet
    postnet_embedding_dim: !ref <postnet_embedding_dim>
    postnet_kernel_size: !ref <postnet_kernel_size>
    postnet_n_convolutions: !ref <postnet_n_convolutions>
    decoder_no_early_stopping: !ref <decoder_no_early_stopping>
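
# A minimal inference sketch for the model above (a sketch, assuming the
# weights have already been loaded, e.g. via the pretrainer at the bottom
# of this file; `sequence` is a list of symbol IDs such as those produced
# by the text_to_sequence function below):
#
#   import torch
#   model = hparams["model"].eval()
#   inputs = torch.tensor([sequence])           # [batch, num_symbols]
#   lengths = torch.tensor([len(sequence)])
#   with torch.no_grad():
#       mel_outputs, mel_lengths, alignments = model.infer(inputs, lengths)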

# Function that converts raw text into a sequence of symbol IDs accepted
# by the model.
text_to_sequence: !name:speechbrain.utils.text_to_sequence.text_to_sequence
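
# A minimal usage sketch (the "english_cleaners" cleaner name follows the
# LJSpeech-style recipes; treat it as an assumption for other setups):
#
#   from speechbrain.utils.text_to_sequence import text_to_sequence
#   sequence = text_to_sequence("Hello world", ["english_cleaners"])
#   # -> list of integer symbol IDs, suitable as model input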

modules:
    model: !ref <model>

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
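
# A minimal loading sketch using the pretrainer above (the checkpoint
# source path is hypothetical; collect_files/load_collected are the
# standard Pretrainer entry points):
#
#   pretrainer = hparams["pretrainer"]
#   pretrainer.collect_files(default_source="path/to/pretrained/model")
#   pretrainer.load_collected()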