w11wo committed on
Commit
fdb9167
1 Parent(s): 2a0d31e

Added Model

Browse files
.gitattributes CHANGED
@@ -14,6 +14,7 @@
14
  *.npy filter=lfs diff=lfs merge=lfs -text
15
  *.npz filter=lfs diff=lfs merge=lfs -text
16
  *.onnx filter=lfs diff=lfs merge=lfs -text
 
17
  *.ot filter=lfs diff=lfs merge=lfs -text
18
  *.parquet filter=lfs diff=lfs merge=lfs -text
19
  *.pb filter=lfs diff=lfs merge=lfs -text
 
14
  *.npy filter=lfs diff=lfs merge=lfs -text
15
  *.npz filter=lfs diff=lfs merge=lfs -text
16
  *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ort filter=lfs diff=lfs merge=lfs -text
18
  *.ot filter=lfs diff=lfs merge=lfs -text
19
  *.parquet filter=lfs diff=lfs merge=lfs -text
20
  *.pb filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,207 @@
1
  ---
 
2
  license: apache-2.0
 
 
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ language: sw
3
  license: apache-2.0
4
+ tags:
5
+ - tensorflowtts
6
+ - audio
7
+ - text-to-speech
8
+ - mel-to-wav
9
+ inference: false
10
+ datasets:
11
+ - bookbot/sw-TZ-Victoria
12
+ - bookbot/sw-TZ-Victoria-syllables
13
+ - bookbot/sw-TZ-Victoria-v2
14
+ - bookbot/sw-TZ-VictoriaNeural
15
  ---
16
+
17
+ # MB-MelGAN HiFi PostNets SW v4
18
+
19
+ MB-MelGAN HiFi PostNets SW v4 is a mel-to-wav model based on the [MB-MelGAN](https://arxiv.org/abs/2005.05106) architecture with a [HiFi-GAN](https://arxiv.org/abs/2010.05646) discriminator. This model was trained from scratch on real and synthetic audio datasets. Instead of training on ground-truth waveform spectrograms, this model was trained on the generated PostNet spectrograms of [LightSpeech MFA SW v4](https://huggingface.co/bookbot/lightspeech-mfa-sw-v4). The list of speakers includes:
20
+
21
+ - sw-TZ-Victoria
22
+ - sw-TZ-Victoria-syllables
23
+ - sw-TZ-Victoria-v2
24
+ - sw-TZ-VictoriaNeural
25
+
26
+ This model was trained using the [TensorFlowTTS](https://github.com/TensorSpeech/TensorFlowTTS) framework. All training was done on an RTX 4090 GPU. All necessary scripts used for training can be found in this [GitHub fork](https://github.com/bookbot-hive/TensorFlowTTS), as well as the [training metrics](https://huggingface.co/bookbot/mb-melgan-hifi-postnets-sw-v4/tensorboard) logged via TensorBoard.
27
+
28
+ ## Model
29
+
30
+ | Model | Config | SR (Hz) | Mel range (Hz) | FFT / Hop / Win (pt) | #steps |
31
+ | ------------------------------- | ----------------------------------------------------------------------------------------- | ------- | -------------- | -------------------- | ------ |
32
+ | `mb-melgan-hifi-postnets-sw-v4` | [Link](https://huggingface.co/bookbot/mb-melgan-hifi-postnets-sw-v4/blob/main/config.yml) | 44.1K | 20-11025 | 2048 / 512 / None | 1M |
33
+
34
+ ## Training Procedure
35
+
36
+ <details>
37
+ <summary>Feature Extraction Setting</summary>
38
+
39
+ sampling_rate: 44100
40
+ hop_size: 512 # Hop size.
41
+ format: "npy"
42
+
43
+ </details>
44
+
45
+ <details>
46
+ <summary>Generator Network Architecture Setting</summary>
47
+
48
+ model_type: "multiband_melgan_generator"
49
+
50
+ multiband_melgan_generator_params:
51
+ out_channels: 4 # Number of output channels (number of subbands).
52
+ kernel_size: 7 # Kernel size of initial and final conv layers.
53
+ filters: 384 # Initial number of channels for conv layers.
54
+ upsample_scales: [8, 4, 4] # List of Upsampling scales.
55
+ stack_kernel_size: 3 # Kernel size of dilated conv layers in residual stack.
56
+ stacks: 4 # Number of stacks in a single residual stack module.
57
+ is_weight_norm: false # Use weight-norm or not.
58
+
59
+ </details>
60
+
61
+ <details>
62
+ <summary>Discriminator Network Architecture Setting</summary>
63
+
64
+ multiband_melgan_discriminator_params:
65
+ out_channels: 1 # Number of output channels.
66
+ scales: 3 # Number of multi-scales.
67
+ downsample_pooling: "AveragePooling1D" # Pooling type for the input downsampling.
68
+ downsample_pooling_params: # Parameters of the above pooling function.
69
+ pool_size: 4
70
+ strides: 2
71
+ kernel_sizes: [5, 3] # List of kernel size.
72
+ filters: 16 # Number of channels of the initial conv layer.
73
+ max_downsample_filters: 512 # Maximum number of channels of downsampling layers.
74
+ downsample_scales: [4, 4, 4] # List of downsampling scales.
75
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
76
+ nonlinear_activation_params: # Parameters of nonlinear activation function.
77
+ alpha: 0.2
78
+ is_weight_norm: false # Use weight-norm or not.
79
+
80
+ hifigan_discriminator_params:
81
+ out_channels: 1 # Number of output channels (number of subbands).
82
+ period_scales: [3, 5, 7, 11, 17, 23, 37] # List of period scales.
83
+ n_layers: 5 # Number of layer of each period discriminator.
84
+ kernel_size: 5 # Kernel size.
85
+ strides: 3 # Strides
86
+ filters: 8 # In Conv filters of each period discriminator
87
+ filter_scales: 4 # Filter scales.
88
+ max_filters: 512 # maximum filters of period discriminator's conv.
89
+ is_weight_norm: false # Use weight-norm or not.
90
+
91
+ </details>
92
+
93
+ <details>
94
+ <summary>STFT Loss Setting</summary>
95
+
96
+ stft_loss_params:
97
+ fft_lengths: [1024, 2048, 512] # List of FFT size for STFT-based loss.
98
+ frame_steps: [120, 240, 50] # List of hop size for STFT-based loss
99
+ frame_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
100
+
101
+ subband_stft_loss_params:
102
+ fft_lengths: [384, 683, 171] # List of FFT size for STFT-based loss.
103
+ frame_steps: [30, 60, 10] # List of hop size for STFT-based loss
104
+ frame_lengths: [150, 300, 60] # List of window length for STFT-based loss.
105
+
106
+ </details>
107
+
108
+ <details>
109
+ <summary>Adversarial Loss Setting</summary>
110
+
111
+ lambda_feat_match: 10.0 # Loss balancing coefficient for feature matching loss
112
+ lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss.
113
+
114
+ </details>
115
+
116
+ <details>
117
+ <summary>Data Loader Setting</summary>
118
+
119
+ batch_size: 32 # Batch size for each GPU, assuming that gradient_accumulation_steps == 1.
120
+ eval_batch_size: 16
121
+ batch_max_steps: 8192 # Length of each audio in batch for training. Make sure it is divisible by hop_size.
122
+ batch_max_steps_valid: 8192 # Length of each audio for validation. Make sure it is divisible by hop_size.
123
+ remove_short_samples: true # Whether to remove samples whose length is less than batch_max_steps.
124
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
125
+ is_shuffle: true # shuffle dataset after each epoch.
126
+
127
+ </details>
128
+
129
+ <details>
130
+ <summary>Optimizer & Scheduler Setting</summary>
131
+
132
+ generator_optimizer_params:
133
+ lr_fn: "PiecewiseConstantDecay"
134
+ lr_params:
135
+ boundaries: [100000, 150000, 400000, 500000, 600000, 700000]
136
+ values: [0.0005, 0.00025, 0.000125, 0.0000625, 0.00003125, 0.000015625, 0.000001]
137
+ amsgrad: false
138
+
139
+ discriminator_optimizer_params:
140
+ lr_fn: "PiecewiseConstantDecay"
141
+ lr_params:
142
+ boundaries: [100000, 200000, 300000, 400000, 500000]
143
+ values: [0.00025, 0.000125, 0.0000625, 0.00003125, 0.000015625, 0.000001]
144
+ amsgrad: false
145
+
146
+ gradient_accumulation_steps: 1
147
+
148
+ </details>
149
+
150
+ <details>
151
+ <summary>Interval Setting</summary>
152
+
153
+ discriminator_train_start_steps: 200000 # Step at which discriminator training begins.
154
+ train_max_steps: 1000000 # Number of training steps.
155
+ save_interval_steps: 20000 # Interval steps to save checkpoint.
156
+ eval_interval_steps: 5000 # Interval steps to evaluate the network.
157
+ log_interval_steps: 200 # Interval steps to record the training log.
158
+
159
+ </details>
160
+
161
+ <details>
162
+ <summary>Other Setting</summary>
163
+
164
+ num_save_intermediate_results: 1 # Number of batches to be saved as intermediate results.
165
+
166
+ </details>
167
+
168
+ ## How to Use
169
+
170
+ ```py
171
+ import soundfile as sf
172
+ import tensorflow as tf
173
+ from tensorflow_tts.inference import TFAutoModel, AutoProcessor
174
+
175
+ lightspeech = TFAutoModel.from_pretrained("bookbot/lightspeech-mfa-sw-v4")
176
+ processor = AutoProcessor.from_pretrained("bookbot/lightspeech-mfa-sw-v4")
177
+ mb_melgan = TFAutoModel.from_pretrained("bookbot/mb-melgan-hifi-postnets-sw-v4")
178
+
179
+ text, speaker_name = "Hello World.", "sw-TZ-Victoria"
180
+ input_ids = processor.text_to_sequence(text)
181
+
182
+ mel, _, _ = lightspeech.inference(
183
+ input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
184
+ speaker_ids=tf.convert_to_tensor(
185
+ [processor.speakers_map[speaker_name]], dtype=tf.int32
186
+ ),
187
+ speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
188
+ f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
189
+ energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
190
+ )
191
+
192
+ audio = mb_melgan.inference(mel)[0, :, 0]
193
+ sf.write("./audio.wav", audio, 44100, "PCM_16")
194
+ ```
195
+
196
+ ## Disclaimer
197
+
198
+ Do consider the biases which came from pre-training datasets that may be carried over into the results of this model.
199
+
200
+ ## Authors
201
+
202
+ MB-MelGAN HiFi PostNets SW v4 was trained and evaluated by [David Samuel Setiawan](https://davidsamuell.github.io/) and [Wilson Wongso](https://wilsonwongso.dev/). All computation and development were done on local machines.
203
+
204
+ ## Framework versions
205
+
206
+ - TensorFlowTTS 1.8
207
+ - TensorFlow 2.12.0
config.yml ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ allow_cache: true
2
+ batch_max_steps: 8192
3
+ batch_max_steps_valid: 8192
4
+ batch_size: 32
5
+ config: ./TensorFlowTTS/examples/multiband_melgan_hf/conf/multiband_melgan_hf.sw.v2.yml
6
+ dev_dir: ./dump/valid/
7
+ discriminator_mixed_precision: false
8
+ discriminator_optimizer_params:
9
+ amsgrad: false
10
+ lr_fn: PiecewiseConstantDecay
11
+ lr_params:
12
+ boundaries:
13
+ - 100000
14
+ - 200000
15
+ - 300000
16
+ - 400000
17
+ - 500000
18
+ values:
19
+ - 0.00025
20
+ - 0.000125
21
+ - 6.25e-05
22
+ - 3.125e-05
23
+ - 1.5625e-05
24
+ - 1.0e-06
25
+ discriminator_train_start_steps: 200000
26
+ eval_batch_size: 16
27
+ eval_interval_steps: 5000
28
+ format: npy
29
+ generator_mixed_precision: false
30
+ generator_optimizer_params:
31
+ amsgrad: false
32
+ lr_fn: PiecewiseConstantDecay
33
+ lr_params:
34
+ boundaries:
35
+ - 100000
36
+ - 150000
37
+ - 400000
38
+ - 500000
39
+ - 600000
40
+ - 700000
41
+ values:
42
+ - 0.0005
43
+ - 0.00025
44
+ - 0.000125
45
+ - 6.25e-05
46
+ - 3.125e-05
47
+ - 1.5625e-05
48
+ - 1.0e-06
49
+ gradient_accumulation_steps: 1
50
+ hifigan_discriminator_params:
51
+ filter_scales: 4
52
+ filters: 8
53
+ is_weight_norm: false
54
+ kernel_size: 5
55
+ max_filters: 512
56
+ n_layers: 5
57
+ out_channels: 1
58
+ period_scales:
59
+ - 3
60
+ - 5
61
+ - 7
62
+ - 11
63
+ - 17
64
+ - 23
65
+ - 37
66
+ strides: 3
67
+ hop_size: 512
68
+ is_shuffle: true
69
+ lambda_adv: 2.5
70
+ lambda_feat_match: 10.0
71
+ log_interval_steps: 200
72
+ model_type: multiband_melgan_generator
73
+ multiband_melgan_discriminator_params:
74
+ downsample_pooling: AveragePooling1D
75
+ downsample_pooling_params:
76
+ pool_size: 4
77
+ strides: 2
78
+ downsample_scales:
79
+ - 4
80
+ - 4
81
+ - 4
82
+ filters: 16
83
+ is_weight_norm: false
84
+ kernel_sizes:
85
+ - 5
86
+ - 3
87
+ max_downsample_filters: 512
88
+ nonlinear_activation: LeakyReLU
89
+ nonlinear_activation_params:
90
+ alpha: 0.2
91
+ out_channels: 1
92
+ scales: 3
93
+ multiband_melgan_generator_params:
94
+ filters: 384
95
+ is_weight_norm: false
96
+ kernel_size: 7
97
+ out_channels: 4
98
+ stack_kernel_size: 3
99
+ stacks: 4
100
+ upsample_scales:
101
+ - 8
102
+ - 4
103
+ - 4
104
+ num_save_intermediate_results: 1
105
+ outdir: ./mb-melgan-hifi-sw-tz-victoria-ft-vocab-exp-synth-v2/
106
+ postnets: true
107
+ pretrained: ''
108
+ remove_short_samples: true
109
+ resume: ./mb-melgan-hifi-sw-tz-victoria-ft-vocab-exp-synth-v2/checkpoints/ckpt-200000
110
+ sampling_rate: 44100
111
+ save_interval_steps: 20000
112
+ stft_loss_params:
113
+ fft_lengths:
114
+ - 1024
115
+ - 2048
116
+ - 512
117
+ frame_lengths:
118
+ - 600
119
+ - 1200
120
+ - 240
121
+ frame_steps:
122
+ - 120
123
+ - 240
124
+ - 50
125
+ subband_stft_loss_params:
126
+ fft_lengths:
127
+ - 384
128
+ - 683
129
+ - 171
130
+ frame_lengths:
131
+ - 150
132
+ - 300
133
+ - 60
134
+ frame_steps:
135
+ - 30
136
+ - 60
137
+ - 10
138
+ train_dir: ./dump/train/
139
+ train_max_steps: 1000000
140
+ use_norm: true
141
+ verbose: 1
142
+ version: '0.0'
events.out.tfevents.1722593892.s44504-focus-slate.255003.0.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40cd11bdd2d61fda7e9978b36bd4120e6e2fc0ca0122f8bcab3dfc1479a5b385
3
+ size 793522
events.out.tfevents.1722597215.s44504-focus-slate.435226.0.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58a6f6fc3d0f3562edceea5f5428b85810d56d0bc6f845aa53b35b4a8ecf76e4
3
+ size 3176878
mbmelgan.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3514c3bd67bb793b8a823703a75bfbe7ff54b86214fe740da7db4b3eea89d22d
3
+ size 10453995
mbmelgan.ort ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15ac3826ceff45f07885ff1efc3c67a313403f01dda7b7fd4c8e4031f0e6e3e9
3
+ size 10670944
mbmelgan.tflite ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:913a9d404d765218a1bf5ce76c5fc1208ebf0b04cf273e70b1dd4f2f592a3cbf
3
+ size 5298364
model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fc53b85b4b3df83d664942042f3fa0df47704f5d6f33055721ae6a801718f7d
3
+ size 10308488