Added Model

Browse files

Files changed (9) hide show

.gitattributes +1 -0
README.md +204 -0
config.yml +142 -0
events.out.tfevents.1722593892.s44504-focus-slate.255003.0.v2 +3 -0
events.out.tfevents.1722597215.s44504-focus-slate.435226.0.v2 +3 -0
mbmelgan.onnx +3 -0
mbmelgan.ort +3 -0
mbmelgan.tflite +3 -0
model.h5 +3 -0

.gitattributes CHANGED Viewed

@@ -14,6 +14,7 @@
 *.npy filter=lfs diff=lfs merge=lfs -text
 *.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text

 *.npy filter=lfs diff=lfs merge=lfs -text
 *.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
+*.ort filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,3 +1,207 @@
 ---
 license: apache-2.0
 ---

 ---
+language: sw
 license: apache-2.0
+tags:
+  - tensorflowtts
+  - audio
+  - text-to-speech
+  - mel-to-wav
+inference: false
+datasets:
+  - bookbot/sw-TZ-Victoria
+  - bookbot/sw-TZ-Victoria-syllables
+  - bookbot/sw-TZ-Victoria-v2
+  - bookbot/sw-TZ-VictoriaNeural
 ---
+# MB-MelGAN HiFi PostNets SW v4
+MB-MelGAN HiFi PostNets SW v4 is a mel-to-wav model based on the [MB-MelGAN](https://arxiv.org/abs/2005.05106) architecture with a [HiFi-GAN](https://arxiv.org/abs/2010.05646) discriminator. This model was trained from scratch on real and synthetic audio datasets. Instead of training on ground truth waveform spectrograms, this model was trained on the generated PostNet spectrograms of [LightSpeech MFA SW v4](https://huggingface.co/bookbot/lightspeech-mfa-sw-v4). The list of speakers includes:
+- sw-TZ-Victoria
+- sw-TZ-Victoria-syllables
+- sw-TZ-Victoria-v2
+- sw-TZ-VictoriaNeural
+This model was trained using the [TensorFlowTTS](https://github.com/TensorSpeech/TensorFlowTTS) framework. All training was done on an RTX 4090 GPU. All necessary scripts used for training can be found in this [GitHub fork](https://github.com/bookbot-hive/TensorFlowTTS), as well as the [training metrics](https://huggingface.co/bookbot/mb-melgan-hifi-postnets-sw-v4/tensorboard) logged via TensorBoard.
+## Model
+| Model                           | Config                                                                                    | SR (Hz) | Mel range (Hz) | FFT / Hop / Win (pt) | #steps |
+| ------------------------------- | ----------------------------------------------------------------------------------------- | ------- | -------------- | -------------------- | ------ |
+| `mb-melgan-hifi-postnets-sw-v4` | [Link](https://huggingface.co/bookbot/mb-melgan-hifi-postnets-sw-v4/blob/main/config.yml) | 44.1K   | 20-11025       | 2048 / 512 / None    | 1M     |
+## Training Procedure
+<details>
+  <summary>Feature Extraction Setting</summary>
+    sampling_rate: 44100
+    hop_size: 512 # Hop size.
+    format: "npy"
+</details>
+<details>
+  <summary>Generator Network Architecture Setting</summary>
+    model_type: "multiband_melgan_generator"
+    multiband_melgan_generator_params:
+        out_channels: 4 # Number of output channels (number of subbands).
+        kernel_size: 7 # Kernel size of initial and final conv layers.
+        filters: 384 # Initial number of channels for conv layers.
+        upsample_scales: [8, 4, 4] # List of Upsampling scales.
+        stack_kernel_size: 3 # Kernel size of dilated conv layers in residual stack.
+        stacks: 4 # Number of stacks in a single residual stack module.
+        is_weight_norm: false # Use weight-norm or not.
+</details>
+<details>
+  <summary>Discriminator Network Architecture Setting</summary>
+    multiband_melgan_discriminator_params:
+        out_channels: 1 # Number of output channels.
+        scales: 3 # Number of multi-scales.
+        downsample_pooling: "AveragePooling1D" # Pooling type for the input downsampling.
+        downsample_pooling_params: # Parameters of the above pooling function.
+            pool_size: 4
+            strides: 2
+        kernel_sizes: [5, 3] # List of kernel size.
+        filters: 16 # Number of channels of the initial conv layer.
+        max_downsample_filters: 512 # Maximum number of channels of downsampling layers.
+        downsample_scales: [4, 4, 4] # List of downsampling scales.
+        nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
+        nonlinear_activation_params: # Parameters of nonlinear activation function.
+            alpha: 0.2
+        is_weight_norm: false # Use weight-norm or not.
+    hifigan_discriminator_params:
+        out_channels: 1 # Number of output channels (number of subbands).
+        period_scales: [3, 5, 7, 11, 17, 23, 37] # List of period scales.
+        n_layers: 5 # Number of layer of each period discriminator.
+        kernel_size: 5 # Kernel size.
+        strides: 3 # Strides
+        filters: 8 # In Conv filters of each period discriminator
+        filter_scales: 4 # Filter scales.
+        max_filters: 512 # maximum filters of period discriminator's conv.
+        is_weight_norm: false # Use weight-norm or not.
+</details>
+<details>
+  <summary>STFT Loss Setting</summary>
+    stft_loss_params:
+        fft_lengths: [1024, 2048, 512] # List of FFT size for STFT-based loss.
+        frame_steps: [120, 240, 50] # List of hop size for STFT-based loss
+        frame_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
+    subband_stft_loss_params:
+        fft_lengths: [384, 683, 171] # List of FFT size for STFT-based loss.
+        frame_steps: [30, 60, 10] # List of hop size for STFT-based loss
+        frame_lengths: [150, 300, 60] # List of window length for STFT-based loss.
+</details>
+<details>
+  <summary>Adversarial Loss Setting</summary>
+    lambda_feat_match: 10.0 # Loss balancing coefficient for feature matching loss
+    lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss.
+</details>
+<details>
+  <summary>Data Loader Setting</summary>
+    batch_size: 32 # Batch size for each GPU, assuming that gradient_accumulation_steps == 1.
+    eval_batch_size: 16
+    batch_max_steps: 8192 # Length of each audio in batch for training. Make sure it is divisible by hop_size.
+    batch_max_steps_valid: 8192 # Length of each audio for validation. Make sure it is divisible by hop_size.
+    remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
+    allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
+    is_shuffle: true # shuffle dataset after each epoch.
+</details>
+<details>
+  <summary>Optimizer & Scheduler Setting</summary>
+    generator_optimizer_params:
+        lr_fn: "PiecewiseConstantDecay"
+        lr_params:
+            boundaries: [100000, 150000, 400000, 500000, 600000, 700000]
+            values: [0.0005, 0.00025, 0.000125, 0.0000625, 0.00003125, 0.000015625, 0.000001]
+        amsgrad: false
+    discriminator_optimizer_params:
+        lr_fn: "PiecewiseConstantDecay"
+        lr_params:
+            boundaries: [100000, 200000, 300000, 400000, 500000]
+            values: [0.00025, 0.000125, 0.0000625, 0.00003125, 0.000015625, 0.000001]
+        amsgrad: false
+    gradient_accumulation_steps: 1
+</details>
+<details>
+  <summary>Interval Setting</summary>
+    discriminator_train_start_steps: 200000 # Step at which discriminator training begins.
+    train_max_steps: 1000000 # Number of training steps.
+    save_interval_steps: 20000 # Interval steps to save checkpoint.
+    eval_interval_steps: 5000 # Interval steps to evaluate the network.
+    log_interval_steps: 200 # Interval steps to record the training log.
+</details>
+<details>
+  <summary>Other Setting</summary>
+    num_save_intermediate_results: 1 # Number of batch to be saved as intermediate results.
+</details>
+## How to Use
+```py
+import soundfile as sf
+import tensorflow as tf
+from tensorflow_tts.inference import TFAutoModel, AutoProcessor
+lightspeech = TFAutoModel.from_pretrained("bookbot/lightspeech-mfa-sw-v4")
+processor = AutoProcessor.from_pretrained("bookbot/lightspeech-mfa-sw-v4")
+mb_melgan = TFAutoModel.from_pretrained("bookbot/mb-melgan-hifi-postnets-sw-v4")
+text, speaker_name = "Hello World.", "sw-TZ-Victoria"
+input_ids = processor.text_to_sequence(text)
+mel, _, _ = lightspeech.inference(
+    input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
+    speaker_ids=tf.convert_to_tensor(
+        [processor.speakers_map[speaker_name]], dtype=tf.int32
+    ),
+    speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
+    f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
+    energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
+)
+audio = mb_melgan.inference(mel)[0, :, 0]
+sf.write("./audio.wav", audio, 44100, "PCM_16")
+```
+## Disclaimer
+Do consider the biases which came from pre-training datasets that may be carried over into the results of this model.
+## Authors
+MB-MelGAN HiFi PostNets SW v4 was trained and evaluated by [David Samuel Setiawan](https://davidsamuell.github.io/) and [Wilson Wongso](https://wilsonwongso.dev/). All computation and development were done on local machines.
+## Framework versions
+- TensorFlowTTS 1.8
+- TensorFlow 2.12.0

config.yml ADDED Viewed

	@@ -0,0 +1,142 @@

+allow_cache: true
+batch_max_steps: 8192
+batch_max_steps_valid: 8192
+batch_size: 32
+config: ./TensorFlowTTS/examples/multiband_melgan_hf/conf/multiband_melgan_hf.sw.v2.yml
+dev_dir: ./dump/valid/
+discriminator_mixed_precision: false
+discriminator_optimizer_params:
+  amsgrad: false
+  lr_fn: PiecewiseConstantDecay
+  lr_params:
+    boundaries:
+    - 100000
+    - 200000
+    - 300000
+    - 400000
+    - 500000
+    values:
+    - 0.00025
+    - 0.000125
+    - 6.25e-05
+    - 3.125e-05
+    - 1.5625e-05
+    - 1.0e-06
+discriminator_train_start_steps: 200000
+eval_batch_size: 16
+eval_interval_steps: 5000
+format: npy
+generator_mixed_precision: false
+generator_optimizer_params:
+  amsgrad: false
+  lr_fn: PiecewiseConstantDecay
+  lr_params:
+    boundaries:
+    - 100000
+    - 150000
+    - 400000
+    - 500000
+    - 600000
+    - 700000
+    values:
+    - 0.0005
+    - 0.00025
+    - 0.000125
+    - 6.25e-05
+    - 3.125e-05
+    - 1.5625e-05
+    - 1.0e-06
+gradient_accumulation_steps: 1
+hifigan_discriminator_params:
+  filter_scales: 4
+  filters: 8
+  is_weight_norm: false
+  kernel_size: 5
+  max_filters: 512
+  n_layers: 5
+  out_channels: 1
+  period_scales:
+  - 3
+  - 5
+  - 7
+  - 11
+  - 17
+  - 23
+  - 37
+  strides: 3
+hop_size: 512
+is_shuffle: true
+lambda_adv: 2.5
+lambda_feat_match: 10.0
+log_interval_steps: 200
+model_type: multiband_melgan_generator
+multiband_melgan_discriminator_params:
+  downsample_pooling: AveragePooling1D
+  downsample_pooling_params:
+    pool_size: 4
+    strides: 2
+  downsample_scales:
+  - 4
+  - 4
+  - 4
+  filters: 16
+  is_weight_norm: false
+  kernel_sizes:
+  - 5
+  - 3
+  max_downsample_filters: 512
+  nonlinear_activation: LeakyReLU
+  nonlinear_activation_params:
+    alpha: 0.2
+  out_channels: 1
+  scales: 3
+multiband_melgan_generator_params:
+  filters: 384
+  is_weight_norm: false
+  kernel_size: 7
+  out_channels: 4
+  stack_kernel_size: 3
+  stacks: 4
+  upsample_scales:
+  - 8
+  - 4
+  - 4
+num_save_intermediate_results: 1
+outdir: ./mb-melgan-hifi-sw-tz-victoria-ft-vocab-exp-synth-v2/
+postnets: true
+pretrained: ''
+remove_short_samples: true
+resume: ./mb-melgan-hifi-sw-tz-victoria-ft-vocab-exp-synth-v2/checkpoints/ckpt-200000
+sampling_rate: 44100
+save_interval_steps: 20000
+stft_loss_params:
+  fft_lengths:
+  - 1024
+  - 2048
+  - 512
+  frame_lengths:
+  - 600
+  - 1200
+  - 240
+  frame_steps:
+  - 120
+  - 240
+  - 50
+subband_stft_loss_params:
+  fft_lengths:
+  - 384
+  - 683
+  - 171
+  frame_lengths:
+  - 150
+  - 300
+  - 60
+  frame_steps:
+  - 30
+  - 60
+  - 10
+train_dir: ./dump/train/
+train_max_steps: 1000000
+use_norm: true
+verbose: 1
+version: '0.0'

events.out.tfevents.1722593892.s44504-focus-slate.255003.0.v2 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:40cd11bdd2d61fda7e9978b36bd4120e6e2fc0ca0122f8bcab3dfc1479a5b385
+size 793522

events.out.tfevents.1722597215.s44504-focus-slate.435226.0.v2 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:58a6f6fc3d0f3562edceea5f5428b85810d56d0bc6f845aa53b35b4a8ecf76e4
+size 3176878

mbmelgan.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3514c3bd67bb793b8a823703a75bfbe7ff54b86214fe740da7db4b3eea89d22d
+size 10453995

mbmelgan.ort ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15ac3826ceff45f07885ff1efc3c67a313403f01dda7b7fd4c8e4031f0e6e3e9
+size 10670944

mbmelgan.tflite ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:913a9d404d765218a1bf5ce76c5fc1208ebf0b04cf273e70b1dd4f2f592a3cbf
+size 5298364

model.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8fc53b85b4b3df83d664942042f3fa0df47704f5d6f33055721ae6a801718f7d
+size 10308488