speechbrain
English
Tacotron2
zero-shot
multi-speaker-tts
pradnya-hf-dev committed on
Commit
b4c5f17
1 Parent(s): 1e5df9d

Upload 2 files

Browse files
Files changed (2) hide show
  1. hyperparams.yaml +120 -0
  2. model.ckpt +3 -0
hyperparams.yaml ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ################################
2
+ # Audio Parameters #
3
+ ################################
4
+ sample_rate: 22050
5
+ hop_length: 256
6
+ win_length: 1024
7
+ n_mel_channels: 80
8
+ n_fft: 1024
9
+ mel_fmin: 0.0
10
+ mel_fmax: 8000.0
11
+ mel_normalized: False
12
+ power: 1
13
+ norm: "slaney"
14
+ mel_scale: "slaney"
15
+ dynamic_range_compression: True
16
+
17
+ ################################
18
+ # Speaker Embedding Parameters #
19
+ ################################
20
+
21
+ spk_emb_size: 192
22
+ spk_emb_sample_rate: 16000
23
+ custom_mel_spec_encoder: True
24
+ spk_emb_encoder: speechbrain/spkrec-ecapa-voxceleb-mel-spec
25
+
26
+ ################################
27
+ # Optimization Hyperparameters #
28
+ ################################
29
+ mask_padding: True
30
+
31
+
32
+ ################################
33
+ # Model Parameters and model #
34
+ ################################
35
+ n_symbols: 148 # fixed; depends on the symbol set used by textToSequence
36
+ symbols_embedding_dim: 1024
37
+
38
+ # Encoder parameters
39
+ encoder_kernel_size: 5
40
+ encoder_n_convolutions: 6
41
+ encoder_embedding_dim: 1024
42
+
43
+ # Decoder parameters
44
+ # The number of target frames produced per decoder step
45
+ n_frames_per_step: 1
46
+ decoder_rnn_dim: 2048
47
+ prenet_dim: 512
48
+ max_decoder_steps: 1500
49
+ gate_threshold: 0.5
50
+ p_attention_dropout: 0.1
51
+ p_decoder_dropout: 0.1
52
+ decoder_no_early_stopping: False
53
+
54
+ # Attention parameters
55
+ attention_rnn_dim: 2048
56
+ attention_dim: 256
57
+
58
+ # Location Layer parameters
59
+ attention_location_n_filters: 32
60
+ attention_location_kernel_size: 31
61
+
62
+ # Mel-post processing network parameters
63
+ postnet_embedding_dim: 1024
64
+ postnet_kernel_size: 5
65
+ postnet_n_convolutions: 10
66
+
67
+ mel_spectogram: !name:speechbrain.lobes.models.Tacotron2.mel_spectogram
68
+ sample_rate: !ref <sample_rate>
69
+ hop_length: !ref <hop_length>
70
+ win_length: !ref <win_length>
71
+ n_fft: !ref <n_fft>
72
+ n_mels: !ref <n_mel_channels>
73
+ f_min: !ref <mel_fmin>
74
+ f_max: !ref <mel_fmax>
75
+ power: !ref <power>
76
+ normalized: !ref <mel_normalized>
77
+ norm: !ref <norm>
78
+ mel_scale: !ref <mel_scale>
79
+ compression: !ref <dynamic_range_compression>
80
+
81
+ #model
82
+ model: !new:speechbrain.lobes.models.MSTacotron2.Tacotron2
83
+ mask_padding: !ref <mask_padding>
84
+ n_mel_channels: !ref <n_mel_channels>
85
+ # symbols
86
+ n_symbols: !ref <n_symbols>
87
+ symbols_embedding_dim: !ref <symbols_embedding_dim>
88
+ # encoder
89
+ encoder_kernel_size: !ref <encoder_kernel_size>
90
+ encoder_n_convolutions: !ref <encoder_n_convolutions>
91
+ encoder_embedding_dim: !ref <encoder_embedding_dim>
92
+ # attention
93
+ attention_rnn_dim: !ref <attention_rnn_dim>
94
+ attention_dim: !ref <attention_dim>
95
+ # attention location
96
+ attention_location_n_filters: !ref <attention_location_n_filters>
97
+ attention_location_kernel_size: !ref <attention_location_kernel_size>
98
+ # decoder
99
+ n_frames_per_step: !ref <n_frames_per_step>
100
+ decoder_rnn_dim: !ref <decoder_rnn_dim>
101
+ prenet_dim: !ref <prenet_dim>
102
+ max_decoder_steps: !ref <max_decoder_steps>
103
+ gate_threshold: !ref <gate_threshold>
104
+ p_attention_dropout: !ref <p_attention_dropout>
105
+ p_decoder_dropout: !ref <p_decoder_dropout>
106
+ # postnet
107
+ postnet_embedding_dim: !ref <postnet_embedding_dim>
108
+ postnet_kernel_size: !ref <postnet_kernel_size>
109
+ postnet_n_convolutions: !ref <postnet_n_convolutions>
110
+ decoder_no_early_stopping: !ref <decoder_no_early_stopping>
111
+ # speaker embeddings
112
+ spk_emb_size: !ref <spk_emb_size>
113
+
114
+
115
+ modules:
116
+ model: !ref <model>
117
+
118
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
119
+ loadables:
120
+ model: !ref <model>
model.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:294206da88347ea113f1de17bf23eac2cc8a910d8284b5b8b67ead241cddbfd5
3
+ size 619239275