AudioGPT

Build error

App Files Files Community

Datasculptor

lmzjms committed on Apr 28, 2023

Commit

98f685a

•

0 Parent(s):

Duplicate from AIGC-Audio/AudioGPT

Browse files

Co-authored-by: Mingze Li <lmzjms@users.noreply.huggingface.co>

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +34 -0
.gitignore +16 -0
NeuralSeq/LICENSE +21 -0
NeuralSeq/README.md +9 -0
NeuralSeq/configs/config_base.yaml +42 -0
NeuralSeq/configs/singing/base.yaml +42 -0
NeuralSeq/configs/singing/fs2.yaml +3 -0
NeuralSeq/configs/tts/base.yaml +95 -0
NeuralSeq/configs/tts/base_zh.yaml +3 -0
NeuralSeq/configs/tts/emotion/base_text2mel.yaml +17 -0
NeuralSeq/configs/tts/emotion/pre_align.py +25 -0
NeuralSeq/configs/tts/fs2.yaml +80 -0
NeuralSeq/configs/tts/hifigan.yaml +21 -0
NeuralSeq/configs/tts/libritts/__pycache__/pre_align.cpython-38.pyc +0 -0
NeuralSeq/configs/tts/libritts/base_text2mel.yaml +14 -0
NeuralSeq/configs/tts/libritts/fs2.yaml +3 -0
NeuralSeq/configs/tts/libritts/pre_align.py +27 -0
NeuralSeq/configs/tts/libritts/pwg.yaml +8 -0
NeuralSeq/configs/tts/lj/base_mel2wav.yaml +3 -0
NeuralSeq/configs/tts/lj/base_text2mel.yaml +13 -0
NeuralSeq/configs/tts/lj/fs2.yaml +3 -0
NeuralSeq/configs/tts/lj/hifigan.yaml +3 -0
NeuralSeq/configs/tts/lj/pwg.yaml +3 -0
NeuralSeq/configs/tts/pwg.yaml +110 -0
NeuralSeq/data_gen/tts/__pycache__/base_binarizer.cpython-38.pyc +0 -0
NeuralSeq/data_gen/tts/__pycache__/base_binarizer_emotion.cpython-38.pyc +0 -0
NeuralSeq/data_gen/tts/__pycache__/base_preprocess.cpython-38.pyc +0 -0
NeuralSeq/data_gen/tts/__pycache__/data_gen_utils.cpython-37.pyc +0 -0
NeuralSeq/data_gen/tts/__pycache__/data_gen_utils.cpython-38.pyc +0 -0
NeuralSeq/data_gen/tts/base_binarizer.py +224 -0
NeuralSeq/data_gen/tts/base_binarizer_emotion.py +352 -0
NeuralSeq/data_gen/tts/base_preprocess.py +254 -0
NeuralSeq/data_gen/tts/binarizer_zh.py +59 -0
NeuralSeq/data_gen/tts/data_gen_utils.py +357 -0
NeuralSeq/data_gen/tts/emotion/__pycache__/audio.cpython-38.pyc +0 -0
NeuralSeq/data_gen/tts/emotion/__pycache__/inference.cpython-38.pyc +0 -0
NeuralSeq/data_gen/tts/emotion/__pycache__/model.cpython-38.pyc +0 -0
NeuralSeq/data_gen/tts/emotion/__pycache__/params_data.cpython-38.pyc +0 -0
NeuralSeq/data_gen/tts/emotion/__pycache__/params_model.cpython-38.pyc +0 -0
NeuralSeq/data_gen/tts/emotion/audio.py +107 -0
NeuralSeq/data_gen/tts/emotion/inference.py +177 -0
NeuralSeq/data_gen/tts/emotion/model.py +78 -0
NeuralSeq/data_gen/tts/emotion/params_data.py +29 -0
NeuralSeq/data_gen/tts/emotion/params_model.py +11 -0
NeuralSeq/data_gen/tts/emotion/test_emotion.py +184 -0
NeuralSeq/data_gen/tts/txt_processors/__init__.py +1 -0
NeuralSeq/data_gen/tts/txt_processors/__pycache__/__init__.cpython-38.pyc +0 -0
NeuralSeq/data_gen/tts/txt_processors/__pycache__/base_text_processor.cpython-38.pyc +0 -0
NeuralSeq/data_gen/tts/txt_processors/__pycache__/en.cpython-38.pyc +0 -0
NeuralSeq/data_gen/tts/txt_processors/base_text_processor.py +47 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,34 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,16 @@

+# JetBrains PyCharm IDE
+.idea/
+.github/
+.circleci/
+# Byte-compiled / optimized / DLL files
+*__pycache__/
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# macOS dir files
+.DS_Store

NeuralSeq/LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2021 Jinglin Liu
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

NeuralSeq/README.md ADDED Viewed

	@@ -0,0 +1,9 @@

+---
+title: DiffSinger🎶 Diffusion for Singing Voice Synthesis
+emoji: 🎶
+colorFrom: purple
+colorTo: blue
+sdk: gradio
+app_file: "inference/svs/gradio/infer.py"
+pinned: false
+---

NeuralSeq/configs/config_base.yaml ADDED Viewed

	@@ -0,0 +1,42 @@

+# task
+binary_data_dir: ''
+work_dir: '' # experiment directory.
+infer: false # infer
+seed: 1234
+debug: false
+save_codes:
+  - configs
+  - modules
+  - tasks
+  - utils
+  - usr
+#############
+# dataset
+#############
+ds_workers: 1
+test_num: 100
+valid_num: 100
+endless_ds: false
+sort_by_len: true
+#########
+# train and eval
+#########
+load_ckpt: ''
+save_ckpt: true
+save_best: false
+num_ckpt_keep: 3
+clip_grad_norm: 0
+accumulate_grad_batches: 1
+log_interval: 100
+num_sanity_val_steps: 5  # steps of validation at the beginning
+check_val_every_n_epoch: 10
+val_check_interval: 2000
+max_epochs: 1000
+max_updates: 160000
+max_tokens: 31250
+max_sentences: 100000
+max_eval_tokens: -1
+max_eval_sentences: -1
+test_input_dir: ''

NeuralSeq/configs/singing/base.yaml ADDED Viewed

	@@ -0,0 +1,42 @@

+base_config:
+  - configs/tts/base.yaml
+  - configs/tts/base_zh.yaml
+datasets: []
+test_prefixes: []
+test_num: 0
+valid_num: 0
+pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
+binarizer_cls: data_gen.singing.binarize.SingingBinarizer
+pre_align_args:
+  use_tone: false # for ZH
+  forced_align: mfa
+  use_sox: true
+hop_size: 128            # Hop size.
+fft_size: 512           # FFT size.
+win_size: 512           # Window size.
+max_frames: 8000
+fmin: 50                 # Minimum freq in mel basis calculation.
+fmax: 11025               # Maximum frequency in mel basis calculation.
+pitch_type: frame
+hidden_size: 256
+mel_loss: "ssim:0.5|l1:0.5"
+lambda_f0: 0.0
+lambda_uv: 0.0
+lambda_energy: 0.0
+lambda_ph_dur: 0.0
+lambda_sent_dur: 0.0
+lambda_word_dur: 0.0
+predictor_grad: 0.0
+use_spk_embed: true
+use_spk_id: false
+max_tokens: 20000
+max_updates: 400000
+num_spk: 100
+save_f0: true
+use_gt_dur: true
+use_gt_f0: true

NeuralSeq/configs/singing/fs2.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+base_config:
+  - configs/tts/fs2.yaml
+  - configs/singing/base.yaml

NeuralSeq/configs/tts/base.yaml ADDED Viewed

	@@ -0,0 +1,95 @@

+# task
+base_config: configs/config_base.yaml
+task_cls: ''
+#############
+# dataset
+#############
+raw_data_dir: ''
+processed_data_dir: ''
+binary_data_dir: ''
+dict_dir: ''
+pre_align_cls: ''
+binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+pre_align_args:
+  use_tone: true # for ZH
+  forced_align: mfa
+  use_sox: false
+  txt_processor: en
+  allow_no_txt: false
+  denoise: false
+binarization_args:
+  shuffle: false
+  with_txt: true
+  with_wav: false
+  with_align: true
+  with_spk_embed: true
+  with_f0: true
+  with_f0cwt: true
+loud_norm: false
+endless_ds: true
+reset_phone_dict: true
+test_num: 100
+valid_num: 100
+max_frames: 1550
+max_input_tokens: 1550
+audio_num_mel_bins: 80
+audio_sample_rate: 22050
+hop_size: 256  # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate)
+win_size: 1024  # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate)
+fmin: 80  # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
+fmax: 7600  # To be increased/reduced depending on data.
+fft_size: 1024  # Extra window size is filled with 0 paddings to match this parameter
+min_level_db: -100
+num_spk: 1
+mel_vmin: -6
+mel_vmax: 1.5
+ds_workers: 4
+#########
+# model
+#########
+dropout: 0.1
+enc_layers: 4
+dec_layers: 4
+hidden_size: 384
+num_heads: 2
+prenet_dropout: 0.5
+prenet_hidden_size: 256
+stop_token_weight: 5.0
+enc_ffn_kernel_size: 9
+dec_ffn_kernel_size: 9
+ffn_act: gelu
+ffn_padding: 'SAME'
+###########
+# optimization
+###########
+lr: 2.0
+warmup_updates: 8000
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+weight_decay: 0
+clip_grad_norm: 1
+###########
+# train and eval
+###########
+max_tokens: 30000
+max_sentences: 100000
+max_eval_sentences: 1
+max_eval_tokens: 60000
+train_set_name: 'train'
+valid_set_name: 'valid'
+test_set_name: 'test'
+vocoder: pwg
+vocoder_ckpt: ''
+profile_infer: false
+out_wav_norm: false
+save_gt: false
+save_f0: false
+gen_dir_name: ''
+use_denoise: false

NeuralSeq/configs/tts/base_zh.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+pre_align_args:
+  txt_processor: zh_g2pM
+binarizer_cls: data_gen.tts.binarizer_zh.ZhBinarizer

NeuralSeq/configs/tts/emotion/base_text2mel.yaml ADDED Viewed

	@@ -0,0 +1,17 @@

+raw_data_dir: 'data/raw/ESD'
+processed_data_dir: 'data/processed/emotion'
+binary_data_dir: 'data/binary/emotion'
+pre_align_cls: configs.tts.emotion.pre_align.EmoPreAlign
+audio_sample_rate: 16000
+binarization_args:
+  shuffle: true
+binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer
+use_spk_id: true
+test_num: 200
+num_spk: 10
+pitch_type: frame
+min_frames: 128
+num_test_samples: 30
+mel_loss: "ssim:0.5|l1:0.5"
+vocoder_ckpt: ''
+use_emotion: true

NeuralSeq/configs/tts/emotion/pre_align.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import os
+from data_gen.tts.base_preprocess import BasePreprocessor
+import glob
+import re
+class EmoPreAlign(BasePreprocessor):
+    def meta_data(self):
+        spks = ['0012', '0011', '0013', '0014', '0015', '0016', '0017', '0018', '0019', '0020']
+        pattern = re.compile('[\t\n ]+')
+        for spk in spks:
+            for line in open(f"{self.raw_data_dir}/{spk}/{spk}.txt", 'r'):  # 打开文件
+                line = re.sub(pattern, ' ', line)
+                if line == ' ': continue
+                split_ = line.split(' ')
+                txt = ' '.join(split_[1: -2])
+                item_name = split_[0]
+                emotion = split_[-2]
+                wav_fn = f'{self.raw_data_dir}/{spk}/{emotion}/{item_name}.wav'
+                yield item_name, wav_fn, txt, spk, emotion
+# Script entry point: instantiate EmoPreAlign and call its process() method.
+if __name__ == "__main__":
+    EmoPreAlign().process()

NeuralSeq/configs/tts/fs2.yaml ADDED Viewed

	@@ -0,0 +1,80 @@

+base_config: configs/tts/base.yaml
+task_cls: tasks.tts.fs2.FastSpeech2Task
+# model
+hidden_size: 256
+dropout: 0.1
+encoder_type: fft # fft|tacotron|tacotron2|conformer
+encoder_K: 8 # for tacotron encoder
+decoder_type: fft # fft|rnn|conv|conformer
+use_pos_embed: true
+# duration
+predictor_hidden: -1
+predictor_kernel: 5
+predictor_layers: 2
+dur_predictor_kernel: 3
+dur_predictor_layers: 2
+predictor_dropout: 0.5
+# pitch and energy
+use_pitch_embed: true
+pitch_type: ph # frame|ph|cwt
+use_uv: true
+cwt_hidden_size: 128
+cwt_layers: 2
+cwt_loss: l1
+cwt_add_f0_loss: false
+cwt_std_scale: 0.8
+pitch_ar: false
+#pitch_embed_type: 0q
+pitch_loss: 'l1' # l1|l2|ssim
+pitch_norm: log
+use_energy_embed: false
+# reference encoder and speaker embedding
+use_spk_id: false
+use_split_spk_id: false
+use_spk_embed: false
+use_var_enc: false
+lambda_commit: 0.25
+ref_norm_layer: bn
+pitch_enc_hidden_stride_kernel:
+  - 0,2,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
+  - 0,2,5
+  - 0,2,5
+dur_enc_hidden_stride_kernel:
+  - 0,2,3 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
+  - 0,2,3
+  - 0,1,3
+# mel
+mel_loss: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5
+# loss lambda
+lambda_f0: 1.0
+lambda_uv: 1.0
+lambda_energy: 0.1
+lambda_ph_dur: 1.0
+lambda_sent_dur: 1.0
+lambda_word_dur: 1.0
+predictor_grad: 0.1
+# train and eval
+pretrain_fs_ckpt: ''
+warmup_updates: 2000
+max_tokens: 32000
+max_sentences: 100000
+max_eval_sentences: 1
+max_updates: 120000
+num_valid_plots: 5
+num_test_samples: 0
+test_ids: []
+use_gt_dur: false
+use_gt_f0: false
+# exp
+dur_loss: mse # huber|mol
+norm_type: gn

NeuralSeq/configs/tts/hifigan.yaml ADDED Viewed

	@@ -0,0 +1,21 @@

+base_config: configs/tts/pwg.yaml
+task_cls: tasks.vocoder.hifigan.HifiGanTask
+resblock: "1"
+adam_b1: 0.8
+adam_b2: 0.99
+upsample_rates: [ 8,8,2,2 ]
+upsample_kernel_sizes: [ 16,16,4,4 ]
+upsample_initial_channel: 128
+resblock_kernel_sizes: [ 3,7,11 ]
+resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ]
+lambda_mel: 45.0
+max_samples: 8192
+max_sentences: 16
+generator_params:
+  lr: 0.0002            # Generator's learning rate.
+  aux_context_window: 0 # Context window size for auxiliary feature.
+discriminator_optimizer_params:
+  lr: 0.0002            # Discriminator's learning rate.

NeuralSeq/configs/tts/libritts/__pycache__/pre_align.cpython-38.pyc ADDED Viewed

Binary file (981 Bytes). View file

NeuralSeq/configs/tts/libritts/base_text2mel.yaml ADDED Viewed

	@@ -0,0 +1,14 @@

+raw_data_dir: 'data/raw/LibriTTS'
+processed_data_dir: 'data/processed/libritts'
+binary_data_dir: 'data/binary/libritts'
+pre_align_cls: configs.tts.libritts.pre_align.LibrittsPreAlign
+binarization_args:
+  shuffle: true
+use_spk_id: true
+test_num: 200
+num_spk: 2320
+pitch_type: frame
+min_frames: 128
+num_test_samples: 30
+mel_loss: "ssim:0.5|l1:0.5"
+vocoder_ckpt: ''

NeuralSeq/configs/tts/libritts/fs2.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+base_config:
+  - configs/tts/fs2.yaml
+  - ./base_text2mel.yaml

NeuralSeq/configs/tts/libritts/pre_align.py ADDED Viewed

	@@ -0,0 +1,27 @@

+import os
+from data_gen.tts.base_preprocess import BasePreprocessor
+import glob
+class LibrittsPreAlign(BasePreprocessor):
+    def meta_data(self):
+        wav_fns = sorted(glob.glob(f'{self.raw_data_dir}/*/*/*.wav'))
+        for wav_fn in wav_fns:
+            item_name = os.path.basename(wav_fn)[:-4]
+            txt_fn = f'{wav_fn[:-4]}.normalized.txt'
+            with open(txt_fn, 'r') as f:
+                txt = f.readlines()
+                f.close()
+            spk = item_name.split("_")[0]
+            # Example:
+            #
+            # 'item_name': '103_1241_000000_000001'
+            # 'wav_fn': 'LibriTTS/train-clean-100/103/1241/103_1241_000000_000001.wav'
+            # 'txt': 'matthew Cuthbert is surprised'
+            # 'spk_name': '103'
+            yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt[0], 'spk_name': spk}
+# Script entry point: instantiate LibrittsPreAlign and call its process() method.
+if __name__ == "__main__":
+    LibrittsPreAlign().process()

NeuralSeq/configs/tts/libritts/pwg.yaml ADDED Viewed

	@@ -0,0 +1,8 @@

+base_config: egs/egs_bases/tts/vocoder/pwg.yaml
+raw_data_dir: 'data/raw/LibriTTS'
+processed_data_dir: 'data/processed/libritts'
+binary_data_dir: 'data/binary/libritts_wav'
+generator_params:
+  kernel_size: 5
+num_spk: 400
+max_samples: 20480

NeuralSeq/configs/tts/lj/base_mel2wav.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+raw_data_dir: 'data/raw/LJSpeech-1.1'
+processed_data_dir: 'data/processed/ljspeech'
+binary_data_dir: 'data/binary/ljspeech_wav'

NeuralSeq/configs/tts/lj/base_text2mel.yaml ADDED Viewed

	@@ -0,0 +1,13 @@

+raw_data_dir: 'data/raw/LJSpeech-1.1'
+processed_data_dir: 'data/processed/ljspeech'
+binary_data_dir: 'data/binary/ljspeech'
+pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign
+pitch_type: cwt
+mel_loss: l1
+num_test_samples: 20
+test_ids: [ 68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
+            316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
+use_energy_embed: false
+test_num: 523
+valid_num: 348

NeuralSeq/configs/tts/lj/fs2.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+base_config:
+  - configs/tts/fs2.yaml
+  - configs/tts/lj/base_text2mel.yaml

NeuralSeq/configs/tts/lj/hifigan.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+base_config:
+  - configs/tts/hifigan.yaml
+  - configs/tts/lj/base_mel2wav.yaml

NeuralSeq/configs/tts/lj/pwg.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+base_config:
+  - configs/tts/pwg.yaml
+  - configs/tts/lj/base_mel2wav.yaml

NeuralSeq/configs/tts/pwg.yaml ADDED Viewed

	@@ -0,0 +1,110 @@

+base_config: configs/tts/base.yaml
+task_cls: tasks.vocoder.pwg.PwgTask
+binarization_args:
+  with_wav: true
+  with_spk_embed: false
+  with_align: false
+test_input_dir: ''
+###########
+# train and eval
+###########
+max_samples: 25600
+max_sentences: 5
+max_eval_sentences: 1
+max_updates: 1000000
+val_check_interval: 2000
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+sampling_rate: 22050     # Sampling rate.
+fft_size: 1024           # FFT size.
+hop_size: 256            # Hop size.
+win_length: null         # Window length.
+# If set to null, it will be the same as fft_size.
+window: "hann"           # Window function.
+num_mels: 80             # Number of mel basis.
+fmin: 80                 # Minimum freq in mel basis calculation.
+fmax: 7600               # Maximum frequency in mel basis calculation.
+format: "hdf5"           # Feature file format. "npy" or "hdf5" is supported.
+###########################################################
+#         GENERATOR NETWORK ARCHITECTURE SETTING          #
+###########################################################
+generator_params:
+  in_channels: 1        # Number of input channels.
+  out_channels: 1       # Number of output channels.
+  kernel_size: 3        # Kernel size of dilated convolution.
+  layers: 30            # Number of residual block layers.
+  stacks: 3             # Number of stacks i.e., dilation cycles.
+  residual_channels: 64 # Number of channels in residual conv.
+  gate_channels: 128    # Number of channels in gated conv.
+  skip_channels: 64     # Number of channels in skip conv.
+  aux_channels: 80      # Number of channels for auxiliary feature conv.
+  # Must be the same as num_mels.
+  aux_context_window: 2 # Context window size for auxiliary feature.
+  # If set to 2, previous 2 and future 2 frames will be considered.
+  dropout: 0.0          # Dropout rate. 0.0 means no dropout applied.
+  use_weight_norm: true # Whether to use weight norm.
+  # If set to true, it will be applied to all of the conv layers.
+  upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
+  upsample_params:                      # Upsampling network parameters.
+    upsample_scales: [4, 4, 4, 4]     # Upsampling scales. Product of these must be the same as hop size.
+  use_pitch_embed: false
+###########################################################
+#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
+###########################################################
+discriminator_params:
+  in_channels: 1        # Number of input channels.
+  out_channels: 1       # Number of output channels.
+  kernel_size: 3        # Kernel size of conv layers.
+  layers: 10            # Number of conv layers.
+  conv_channels: 64     # Number of channels in conv layers.
+  bias: true            # Whether to use bias parameter in conv.
+  use_weight_norm: true # Whether to use weight norm.
+  # If set to true, it will be applied to all of the conv layers.
+  nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
+  nonlinear_activation_params:      # Nonlinear function parameters
+    negative_slope: 0.2           # Alpha in LeakyReLU.
+###########################################################
+#                   STFT LOSS SETTING                     #
+###########################################################
+stft_loss_params:
+  fft_sizes: [1024, 2048, 512]  # List of FFT size for STFT-based loss.
+  hop_sizes: [120, 240, 50]     # List of hop size for STFT-based loss
+  win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
+  window: "hann_window"         # Window function for STFT-based loss
+use_mel_loss: false
+###########################################################
+#               ADVERSARIAL LOSS SETTING                  #
+###########################################################
+lambda_adv: 4.0  # Loss balancing coefficient.
+###########################################################
+#             OPTIMIZER & SCHEDULER SETTING               #
+###########################################################
+generator_optimizer_params:
+  lr: 0.0001             # Generator's learning rate.
+  eps: 1.0e-6            # Generator's epsilon.
+  weight_decay: 0.0      # Generator's weight decay coefficient.
+generator_scheduler_params:
+  step_size: 200000      # Generator's scheduler step size.
+  gamma: 0.5             # Generator's scheduler gamma.
+  # At each step size, lr will be multiplied by this parameter.
+generator_grad_norm: 10    # Generator's gradient norm.
+discriminator_optimizer_params:
+  lr: 0.00005            # Discriminator's learning rate.
+  eps: 1.0e-6            # Discriminator's epsilon.
+  weight_decay: 0.0      # Discriminator's weight decay coefficient.
+discriminator_scheduler_params:
+  step_size: 200000      # Discriminator's scheduler step size.
+  gamma: 0.5             # Discriminator's scheduler gamma.
+  # At each step size, lr will be multiplied by this parameter.
+discriminator_grad_norm: 1 # Discriminator's gradient norm.
+disc_start_steps: 40000 # Number of steps to start to train discriminator.

NeuralSeq/data_gen/tts/__pycache__/base_binarizer.cpython-38.pyc ADDED Viewed

Binary file (8.23 kB). View file

NeuralSeq/data_gen/tts/__pycache__/base_binarizer_emotion.cpython-38.pyc ADDED Viewed

Binary file (13.3 kB). View file

NeuralSeq/data_gen/tts/__pycache__/base_preprocess.cpython-38.pyc ADDED Viewed

Binary file (11.1 kB). View file

NeuralSeq/data_gen/tts/__pycache__/data_gen_utils.cpython-37.pyc ADDED Viewed

Binary file (11 kB). View file

NeuralSeq/data_gen/tts/__pycache__/data_gen_utils.cpython-38.pyc ADDED Viewed

Binary file (11 kB). View file

NeuralSeq/data_gen/tts/base_binarizer.py ADDED Viewed

	@@ -0,0 +1,224 @@

+import os
+os.environ["OMP_NUM_THREADS"] = "1"
+from utils.multiprocess_utils import chunked_multiprocess_run
+import random
+import traceback
+import json
+from resemblyzer import VoiceEncoder
+from tqdm import tqdm
+from data_gen.tts.data_gen_utils import get_mel2ph, get_pitch, build_phone_encoder
+from utils.hparams import set_hparams, hparams
+import numpy as np
+from utils.indexed_datasets import IndexedDatasetBuilder
+from vocoders.base_vocoder import VOCODERS
+import pandas as pd
+class BinarizationError(Exception):
+    """Raised while binarizing a single item (missing/mismatched alignment,
+    empty f0, NaN CWT, phoneme-encoding failure).
+    BaseBinarizer.process_item catches it, prints a skip message and returns None.
+    """
+    pass
+class BaseBinarizer:
+    def __init__(self, processed_data_dir=None):
+        if processed_data_dir is None:
+            processed_data_dir = hparams['processed_data_dir']
+        self.processed_data_dirs = processed_data_dir.split(",")
+        self.binarization_args = hparams['binarization_args']
+        self.pre_align_args = hparams['pre_align_args']
+        self.forced_align = self.pre_align_args['forced_align']
+        tg_dir = None
+        if self.forced_align == 'mfa':
+            tg_dir = 'mfa_outputs'
+        if self.forced_align == 'kaldi':
+            tg_dir = 'kaldi_outputs'
+        self.item2txt = {}
+        self.item2ph = {}
+        self.item2wavfn = {}
+        self.item2tgfn = {}
+        self.item2spk = {}
+        for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
+            self.meta_df = pd.read_csv(f"{processed_data_dir}/metadata_phone.csv", dtype=str)
+            for r_idx, r in self.meta_df.iterrows():
+                item_name = raw_item_name = r['item_name']
+                if len(self.processed_data_dirs) > 1:
+                    item_name = f'ds{ds_id}_{item_name}'
+                self.item2txt[item_name] = r['txt']
+                self.item2ph[item_name] = r['ph']
+                self.item2wavfn[item_name] = os.path.join(hparams['raw_data_dir'], 'wavs', os.path.basename(r['wav_fn']).split('_')[1])
+                self.item2spk[item_name] = r.get('spk', 'SPK1')
+                if len(self.processed_data_dirs) > 1:
+                    self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
+                if tg_dir is not None:
+                    self.item2tgfn[item_name] = f"{processed_data_dir}/{tg_dir}/{raw_item_name}.TextGrid"
+        self.item_names = sorted(list(self.item2txt.keys()))
+        if self.binarization_args['shuffle']:
+            random.seed(1234)
+            random.shuffle(self.item_names)
+    @property
+    def train_item_names(self):
+        return self.item_names[hparams['test_num']+hparams['valid_num']:]
+    @property
+    def valid_item_names(self):
+        return self.item_names[0: hparams['test_num']+hparams['valid_num']]  #
+    @property
+    def test_item_names(self):
+        return self.item_names[0: hparams['test_num']]  # Audios for MOS testing are in 'test_ids'
+    def build_spk_map(self):
+        spk_map = set()
+        for item_name in self.item_names:
+            spk_name = self.item2spk[item_name]
+            spk_map.add(spk_name)
+        spk_map = {x: i for i, x in enumerate(sorted(list(spk_map)))}
+        assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
+        return spk_map
+    def item_name2spk_id(self, item_name):
+        return self.spk_map[self.item2spk[item_name]]
+    def _phone_encoder(self):
+        ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
+        ph_set = []
+        if hparams['reset_phone_dict'] or not os.path.exists(ph_set_fn):
+            for processed_data_dir in self.processed_data_dirs:
+                ph_set += [x.split(' ')[0] for x in open(f'{processed_data_dir}/dict.txt').readlines()]
+            ph_set = sorted(set(ph_set))
+            json.dump(ph_set, open(ph_set_fn, 'w'))
+        else:
+            ph_set = json.load(open(ph_set_fn, 'r'))
+        print("| phone set: ", ph_set)
+        return build_phone_encoder(hparams['binary_data_dir'])
+    def meta_data(self, prefix):
+        if prefix == 'valid':
+            item_names = self.valid_item_names
+        elif prefix == 'test':
+            item_names = self.test_item_names
+        else:
+            item_names = self.train_item_names
+        for item_name in item_names:
+            ph = self.item2ph[item_name]
+            txt = self.item2txt[item_name]
+            tg_fn = self.item2tgfn.get(item_name)
+            wav_fn = self.item2wavfn[item_name]
+            spk_id = self.item_name2spk_id(item_name)
+            yield item_name, ph, txt, tg_fn, wav_fn, spk_id
+    def process(self):
+        os.makedirs(hparams['binary_data_dir'], exist_ok=True)
+        self.spk_map = self.build_spk_map()
+        print("| spk_map: ", self.spk_map)
+        spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
+        json.dump(self.spk_map, open(spk_map_fn, 'w'))
+        self.phone_encoder = self._phone_encoder()
+        self.process_data('valid')
+        self.process_data('test')
+        self.process_data('train')
+    def process_data(self, prefix):
+        data_dir = hparams['binary_data_dir']
+        args = []
+        builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
+        lengths = []
+        f0s = []
+        total_sec = 0
+        if self.binarization_args['with_spk_embed']:
+            voice_encoder = VoiceEncoder().cuda()
+        meta_data = list(self.meta_data(prefix))
+        for m in meta_data:
+            args.append(list(m) + [self.phone_encoder, self.binarization_args])
+        num_workers = int(os.getenv('N_PROC', os.cpu_count() // 3))
+        for f_id, (_, item) in enumerate(
+                zip(tqdm(meta_data), chunked_multiprocess_run(self.process_item, args, num_workers=num_workers))):
+            if item is None:
+                continue
+            item['spk_embed'] = voice_encoder.embed_utterance(item['wav']) \
+                if self.binarization_args['with_spk_embed'] else None
+            if not self.binarization_args['with_wav'] and 'wav' in item:
+                print("del wav")
+                del item['wav']
+            builder.add_item(item)
+            lengths.append(item['len'])
+            total_sec += item['sec']
+            if item.get('f0') is not None:
+                f0s.append(item['f0'])
+        builder.finalize()
+        np.save(f'{data_dir}/{prefix}_lengths.npy', lengths)
+        if len(f0s) > 0:
+            f0s = np.concatenate(f0s, 0)
+            f0s = f0s[f0s != 0]
+            np.save(f'{data_dir}/{prefix}_f0s_mean_std.npy', [np.mean(f0s).item(), np.std(f0s).item()])
+        print(f"| {prefix} total duration: {total_sec:.3f}s")
+    @classmethod
+    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
+        if hparams['vocoder'] in VOCODERS:
+            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
+        else:
+            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
+        res = {
+            'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
+            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
+        }
+        try:
+            if binarization_args['with_f0']:
+                cls.get_pitch(wav, mel, res)
+                if binarization_args['with_f0cwt']:
+                    cls.get_f0cwt(res['f0'], res)
+            if binarization_args['with_txt']:
+                try:
+                    phone_encoded = res['phone'] = encoder.encode(ph)
+                except:
+                    traceback.print_exc()
+                    raise BinarizationError(f"Empty phoneme")
+                if binarization_args['with_align']:
+                    cls.get_align(tg_fn, ph, mel, phone_encoded, res)
+        except BinarizationError as e:
+            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
+            return None
+        return res
+    @staticmethod
+    def get_align(tg_fn, ph, mel, phone_encoded, res):
+        if tg_fn is not None and os.path.exists(tg_fn):
+            mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams)
+        else:
+            raise BinarizationError(f"Align not found")
+        if mel2ph.max() - 1 >= len(phone_encoded):
+            raise BinarizationError(
+                f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(phone_encoded)}")
+        res['mel2ph'] = mel2ph
+        res['dur'] = dur
+    @staticmethod
+    def get_pitch(wav, mel, res):
+        f0, pitch_coarse = get_pitch(wav, mel, hparams)
+        if sum(f0) == 0:
+            raise BinarizationError("Empty f0")
+        res['f0'] = f0
+        res['pitch'] = pitch_coarse
+    @staticmethod
+    def get_f0cwt(f0, res):
+        from utils.cwt import get_cont_lf0, get_lf0_cwt
+        uv, cont_lf0_lpf = get_cont_lf0(f0)
+        logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
+        cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
+        Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm)
+        if np.any(np.isnan(Wavelet_lf0)):
+            raise BinarizationError("NaN CWT")
+        res['cwt_spec'] = Wavelet_lf0
+        res['cwt_scales'] = scales
+        res['f0_mean'] = logf0s_mean_org
+        res['f0_std'] = logf0s_std_org
+if __name__ == "__main__":
+    set_hparams()
+    BaseBinarizer().process()

NeuralSeq/data_gen/tts/base_binarizer_emotion.py ADDED Viewed

	@@ -0,0 +1,352 @@

+import os
+os.environ["OMP_NUM_THREADS"] = "1"
+import torch
+from collections import Counter
+from utils.text_encoder import TokenTextEncoder
+from data_gen.tts.emotion import inference as EmotionEncoder
+from data_gen.tts.emotion.inference import embed_utterance as Embed_utterance
+from data_gen.tts.emotion.inference import preprocess_wav
+from utils.multiprocess_utils import chunked_multiprocess_run
+import random
+import traceback
+import json
+from resemblyzer import VoiceEncoder
+from tqdm import tqdm
+from data_gen.tts.data_gen_utils import get_mel2ph, get_pitch, build_phone_encoder, is_sil_phoneme
+from utils.hparams import hparams, set_hparams
+import numpy as np
+from utils.indexed_datasets import IndexedDatasetBuilder
+from vocoders.base_vocoder import get_vocoder_cls
+import pandas as pd
+class BinarizationError(Exception):
+    pass
+class EmotionBinarizer:
+    def __init__(self, processed_data_dir=None):
+        if processed_data_dir is None:
+            processed_data_dir = hparams['processed_data_dir']
+        self.processed_data_dirs = processed_data_dir.split(",")
+        self.binarization_args = hparams['binarization_args']
+        self.pre_align_args = hparams['pre_align_args']
+        self.item2txt = {}
+        self.item2ph = {}
+        self.item2wavfn = {}
+        self.item2tgfn = {}
+        self.item2spk = {}
+        self.item2emo = {}
+    def load_meta_data(self):
+        for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
+            self.meta_df = pd.read_csv(f"{processed_data_dir}/metadata_phone.csv", dtype=str)
+            for r_idx, r in tqdm(self.meta_df.iterrows(), desc='Loading meta data.'):
+                item_name = raw_item_name = r['item_name']
+                if len(self.processed_data_dirs) > 1:
+                    item_name = f'ds{ds_id}_{item_name}'
+                self.item2txt[item_name] = r['txt']
+                self.item2ph[item_name] = r['ph']
+                self.item2wavfn[item_name] = r['wav_fn']
+                self.item2spk[item_name] = r.get('spk_name', 'SPK1') \
+                    if self.binarization_args['with_spk_id'] else 'SPK1'
+                if len(self.processed_data_dirs) > 1:
+                    self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
+                self.item2tgfn[item_name] = f"{processed_data_dir}/mfa_outputs/{raw_item_name}.TextGrid"
+                self.item2emo[item_name] = r.get('others', '"Neutral"')
+        self.item_names = sorted(list(self.item2txt.keys()))
+        if self.binarization_args['shuffle']:
+            random.seed(1234)
+            random.shuffle(self.item_names)
+    @property
+    def train_item_names(self):
+        return self.item_names[hparams['test_num']:]
+    @property
+    def valid_item_names(self):
+        return self.item_names[:hparams['test_num']]
+    @property
+    def test_item_names(self):
+        return self.valid_item_names
+    def build_spk_map(self):
+        spk_map = set()
+        for item_name in self.item_names:
+            spk_name = self.item2spk[item_name]
+            spk_map.add(spk_name)
+        spk_map = {x: i for i, x in enumerate(sorted(list(spk_map)))}
+        print("| #Spk: ", len(spk_map))
+        assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
+        return spk_map
+    def build_emo_map(self):
+        emo_map = set()
+        for item_name in self.item_names:
+            emo_name = self.item2emo[item_name]
+            emo_map.add(emo_name)
+        emo_map = {x: i for i, x in enumerate(sorted(list(emo_map)))}
+        print("| #Emo: ", len(emo_map))
+        return emo_map
+    def item_name2spk_id(self, item_name):
+        return self.spk_map[self.item2spk[item_name]]
+    def item_name2emo_id(self, item_name):
+        return self.emo_map[self.item2emo[item_name]]
+    def _phone_encoder(self):
+        ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
+        ph_set = []
+        if self.binarization_args['reset_phone_dict'] or not os.path.exists(ph_set_fn):
+            for ph_sent in self.item2ph.values():
+                ph_set += ph_sent.split(' ')
+            ph_set = sorted(set(ph_set))
+            json.dump(ph_set, open(ph_set_fn, 'w'))
+            print("| Build phone set: ", ph_set)
+        else:
+            ph_set = json.load(open(ph_set_fn, 'r'))
+            print("| Load phone set: ", ph_set)
+        return build_phone_encoder(hparams['binary_data_dir'])
+    def _word_encoder(self):
+        fn = f"{hparams['binary_data_dir']}/word_set.json"
+        word_set = []
+        if self.binarization_args['reset_word_dict']:
+            for word_sent in self.item2txt.values():
+                word_set += [x for x in word_sent.split(' ') if x != '']
+            word_set = Counter(word_set)
+            total_words = sum(word_set.values())
+            word_set = word_set.most_common(hparams['word_size'])
+            num_unk_words = total_words - sum([x[1] for x in word_set])
+            word_set = [x[0] for x in word_set]
+            json.dump(word_set, open(fn, 'w'))
+            print(f"| Build word set. Size: {len(word_set)}, #total words: {total_words},"
+                  f" #unk_words: {num_unk_words}, word_set[:10]:, {word_set[:10]}.")
+        else:
+            word_set = json.load(open(fn, 'r'))
+            print("| Load word set. Size: ", len(word_set), word_set[:10])
+        return TokenTextEncoder(None, vocab_list=word_set, replace_oov='<UNK>')
+    def meta_data(self, prefix):
+        if prefix == 'valid':
+            item_names = self.valid_item_names
+        elif prefix == 'test':
+            item_names = self.test_item_names
+        else:
+            item_names = self.train_item_names
+        for item_name in item_names:
+            ph = self.item2ph[item_name]
+            txt = self.item2txt[item_name]
+            tg_fn = self.item2tgfn.get(item_name)
+            wav_fn = self.item2wavfn[item_name]
+            spk_id = self.item_name2spk_id(item_name)
+            emotion = self.item_name2emo_id(item_name)
+            yield item_name, ph, txt, tg_fn, wav_fn, spk_id, emotion
+    def process(self):
+        self.load_meta_data()
+        os.makedirs(hparams['binary_data_dir'], exist_ok=True)
+        self.spk_map = self.build_spk_map()
+        print("| spk_map: ", self.spk_map)
+        spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
+        json.dump(self.spk_map, open(spk_map_fn, 'w'))
+        self.emo_map = self.build_emo_map()
+        print("| emo_map: ", self.emo_map)
+        emo_map_fn = f"{hparams['binary_data_dir']}/emo_map.json"
+        json.dump(self.emo_map, open(emo_map_fn, 'w'))
+        self.phone_encoder = self._phone_encoder()
+        self.word_encoder = None
+        EmotionEncoder.load_model(hparams['emotion_encoder_path'])
+        if self.binarization_args['with_word']:
+            self.word_encoder = self._word_encoder()
+        self.process_data('valid')
+        self.process_data('test')
+        self.process_data('train')
+    def process_data(self, prefix):
+        # Binarize one split ('valid', 'test' or 'train'): run process_item over
+        # every item of the split, add speaker/emotion embeddings, store the items
+        # in an indexed dataset and save length / f0 statistics as .npy files.
+        data_dir = hparams['binary_data_dir']
+        args = []
+        builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
+        ph_lengths = []
+        mel_lengths = []
+        f0s = []
+        total_sec = 0
+        if self.binarization_args['with_spk_embed']:
+            # NOTE(review): .cuda() presumably requires a GPU to be available -- confirm.
+            voice_encoder = VoiceEncoder().cuda()
+        meta_data = list(self.meta_data(prefix))
+        for m in meta_data:
+            # Argument list for process_item: metadata tuple + encoders + settings.
+            args.append(list(m) + [(self.phone_encoder, self.word_encoder), self.binarization_args])
+        num_workers = self.num_workers
+        # tqdm(meta_data) is zipped in only to drive the progress bar.
+        for f_id, (_, item) in enumerate(
+                zip(tqdm(meta_data), chunked_multiprocess_run(self.process_item, args, num_workers=num_workers))):
+            if item is None:
+                # process_item returns None for items it decided to skip.
+                continue
+            item['spk_embed'] = voice_encoder.embed_utterance(item['wav']) \
+                if self.binarization_args['with_spk_embed'] else None
+            # The emotion embedding is computed from the wav file on disk.
+            processed_wav = preprocess_wav(item['wav_fn'])
+            item['emo_embed'] = Embed_utterance(processed_wav)
+            # Drop the raw waveform only after the speaker embedding has used it.
+            if not self.binarization_args['with_wav'] and 'wav' in item:
+                del item['wav']
+            builder.add_item(item)
+            mel_lengths.append(item['len'])
+            if 'ph_len' in item:
+                ph_lengths.append(item['ph_len'])
+            total_sec += item['sec']
+            if item.get('f0') is not None:
+                f0s.append(item['f0'])
+        builder.finalize()
+        np.save(f'{data_dir}/{prefix}_lengths.npy', mel_lengths)
+        if len(ph_lengths) > 0:
+            np.save(f'{data_dir}/{prefix}_ph_lengths.npy', ph_lengths)
+        if len(f0s) > 0:
+            # f0 mean/std are computed over non-zero f0 values only.
+            f0s = np.concatenate(f0s, 0)
+            f0s = f0s[f0s != 0]
+            np.save(f'{data_dir}/{prefix}_f0s_mean_std.npy', [np.mean(f0s).item(), np.std(f0s).item()])
+        print(f"| {prefix} total duration: {total_sec:.3f}s")
+    @classmethod
+    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, emotion, encoder, binarization_args):
+        res = {'item_name': item_name, 'txt': txt, 'ph': ph, 'wav_fn': wav_fn, 'spk_id': spk_id, 'emotion': emotion}
+        if binarization_args['with_linear']:
+            wav, mel, linear_stft = get_vocoder_cls(hparams).wav2spec(wav_fn) # , return_linear=True
+            res['linear'] = linear_stft
+        else:
+            wav, mel = get_vocoder_cls(hparams).wav2spec(wav_fn)
+        wav = wav.astype(np.float16)
+        res.update({'mel': mel, 'wav': wav,
+                    'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0]})
+        try:
+            if binarization_args['with_f0']:
+                cls.get_pitch(res)
+                if binarization_args['with_f0cwt']:
+                    cls.get_f0cwt(res)
+            if binarization_args['with_txt']:
+                ph_encoder, word_encoder = encoder
+                try:
+                    res['phone'] = ph_encoder.encode(ph)
+                    res['ph_len'] = len(res['phone'])
+                except:
+                    traceback.print_exc()
+                    raise BinarizationError(f"Empty phoneme")
+                if binarization_args['with_align']:
+                    cls.get_align(tg_fn, res)
+                    if binarization_args['trim_eos_bos']:
+                        bos_dur = res['dur'][0]
+                        eos_dur = res['dur'][-1]
+                        res['mel'] = mel[bos_dur:-eos_dur]
+                        res['f0'] = res['f0'][bos_dur:-eos_dur]
+                        res['pitch'] = res['pitch'][bos_dur:-eos_dur]
+                        res['mel2ph'] = res['mel2ph'][bos_dur:-eos_dur]
+                        res['wav'] = wav[bos_dur * hparams['hop_size']:-eos_dur * hparams['hop_size']]
+                        res['dur'] = res['dur'][1:-1]
+                        res['len'] = res['mel'].shape[0]
+                if binarization_args['with_word']:
+                    cls.get_word(res, word_encoder)
+        except BinarizationError as e:
+            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
+            return None
+        except Exception as e:
+            traceback.print_exc()
+            print(f"| Skip item. item_name: {item_name}, wav_fn: {wav_fn}")
+            return None
+        return res
+    @staticmethod
+    def get_align(tg_fn, res):
+        ph = res['ph']
+        mel = res['mel']
+        phone_encoded = res['phone']
+        if tg_fn is not None and os.path.exists(tg_fn):
+            mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams)
+        else:
+            raise BinarizationError(f"Align not found")
+        if mel2ph.max() - 1 >= len(phone_encoded):
+            raise BinarizationError(
+                f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(phone_encoded)}")
+        res['mel2ph'] = mel2ph
+        res['dur'] = dur
+    @staticmethod
+    def get_pitch(res):
+        wav, mel = res['wav'], res['mel']
+        f0, pitch_coarse = get_pitch(wav, mel, hparams)
+        if sum(f0) == 0:
+            raise BinarizationError("Empty f0")
+        res['f0'] = f0
+        res['pitch'] = pitch_coarse
+    @staticmethod
+    def get_f0cwt(res):
+        from utils.cwt import get_cont_lf0, get_lf0_cwt
+        f0 = res['f0']
+        uv, cont_lf0_lpf = get_cont_lf0(f0)
+        logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
+        cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
+        Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm)
+        if np.any(np.isnan(Wavelet_lf0)):
+            raise BinarizationError("NaN CWT")
+        res['cwt_spec'] = Wavelet_lf0
+        res['cwt_scales'] = scales
+        res['f0_mean'] = logf0s_mean_org
+        res['f0_std'] = logf0s_std_org
+    @staticmethod
+    def get_word(res, word_encoder):
+        # Derive word-level features from the phoneme string in res['ph'] and the
+        # frame alignment in res['mel2ph'], and store them in res:
+        # ph_words, ph2word, mel2word, dur_word, words and word_tokens.
+        ph_split = res['ph'].split(" ")
+        # ph side mapping to word
+        ph_words = []  # ['<BOS>', 'N_AW1_', ',', 'AE1_Z_|', 'AO1_L_|', 'B_UH1_K_S_|', 'N_AA1_T_|', ....]
+        ph2word = np.zeros([len(ph_split)], dtype=int)
+        last_ph_idx_for_word = []  # [2, 11, ...]
+        for i, ph in enumerate(ph_split):
+            if ph == '|':
+                # '|' closes the current word.
+                last_ph_idx_for_word.append(i)
+            elif not ph[0].isalnum():
+                # A token starting with a non-alphanumeric character forms a word of
+                # its own; except for '<BOS>' it also closes the word before it.
+                if ph not in ['<BOS>']:
+                    last_ph_idx_for_word.append(i - 1)
+                last_ph_idx_for_word.append(i)
+        # Each word starts right after the previous word's last phoneme.
+        start_ph_idx_for_word = [0] + [i + 1 for i in last_ph_idx_for_word[:-1]]
+        for i, (s_w, e_w) in enumerate(zip(start_ph_idx_for_word, last_ph_idx_for_word)):
+            ph_words.append(ph_split[s_w:e_w + 1])
+            ph2word[s_w:e_w + 1] = i
+        ph2word = ph2word.tolist()
+        ph_words = ["_".join(w) for w in ph_words]
+        # mel side mapping to word
+        mel2word = []
+        dur_word = [0 for _ in range(len(ph_words))]
+        for i, m2p in enumerate(res['mel2ph']):
+            # mel2ph values are used 1-based here, hence the ``- 1``.
+            word_idx = ph2word[m2p - 1]
+            mel2word.append(ph2word[m2p - 1])
+            dur_word[word_idx] += 1
+        ph2word = [x + 1 for x in ph2word]  # 0 is reserved for padding
+        mel2word = [x + 1 for x in mel2word]  # 0 is reserved for padding
+        res['ph_words'] = ph_words  # [T_word]
+        res['ph2word'] = ph2word  # [T_ph]
+        res['mel2word'] = mel2word  # [T_mel]
+        res['dur_word'] = dur_word  # [T_word]
+        words = [x for x in res['txt'].split(" ") if x != '']
+        # Strip leading / trailing silence tokens before adding <BOS> / <EOS>.
+        while len(words) > 0 and is_sil_phoneme(words[0]):
+            words = words[1:]
+        while len(words) > 0 and is_sil_phoneme(words[-1]):
+            words = words[:-1]
+        words = ['<BOS>'] + words + ['<EOS>']
+        word_tokens = word_encoder.encode(" ".join(words))
+        res['words'] = words
+        res['word_tokens'] = word_tokens
+        # Text-side and phoneme-side word segmentations must agree in length.
+        assert len(words) == len(ph_words), [words, ph_words]
+    @property
+    def num_workers(self):
+        return int(os.getenv('N_PROC', hparams.get('N_PROC', os.cpu_count())))
+if __name__ == "__main__":
+    set_hparams()
+    EmotionBinarizer().process()

NeuralSeq/data_gen/tts/base_preprocess.py ADDED Viewed

	@@ -0,0 +1,254 @@

+import json
+import os
+import random
+import re
+import traceback
+from collections import Counter
+from functools import partial
+import pandas as pd
+import librosa
+from tqdm import tqdm
+from data_gen.tts.txt_processors.base_text_processor import get_txt_processor_cls
+from data_gen.tts.wav_processors.base_processor import get_wav_processor_cls
+from utils.hparams import hparams
+from utils.multiprocess_utils import multiprocess_run_tqdm
+from utils.os_utils import link_file, move_file, remove_file
+from data_gen.tts.data_gen_utils import is_sil_phoneme, build_token_encoder
+class BasePreprocessor:
+    def __init__(self):
+        self.preprocess_args = hparams['preprocess_args']
+        txt_processor = self.preprocess_args['txt_processor']
+        self.txt_processor = get_txt_processor_cls(txt_processor)
+        self.raw_data_dir = hparams['raw_data_dir']
+        self.processed_dir = hparams['processed_data_dir']
+        self.spk_map_fn = f"{self.processed_dir}/spk_map.json"
+    def meta_data(self):
+        """
+        :return: {'item_name': Str, 'wav_fn': Str, 'txt': Str, 'spk_name': Str, 'txt_loader': None or Func}
+        """
+        raise NotImplementedError
+    def process(self):
+        # Full preprocessing pipeline:
+        #   1. first pass  - text -> phonemes and wav processing for every item,
+        #   2. second pass - encode words / phonemes / speakers as ids,
+        #   3. optionally build the MFA input folders and dictionary,
+        #   4. dump all items to "<processed_dir>/<meta_csv_filename>.json".
+        processed_dir = self.processed_dir
+        # Start from clean temporary and output wav directories.
+        wav_processed_tmp_dir = f'{processed_dir}/processed_tmp'
+        remove_file(wav_processed_tmp_dir)
+        os.makedirs(wav_processed_tmp_dir, exist_ok=True)
+        wav_processed_dir = f'{processed_dir}/{self.wav_processed_dirname}'
+        remove_file(wav_processed_dir)
+        os.makedirs(wav_processed_dir, exist_ok=True)
+        meta_data = list(tqdm(self.meta_data(), desc='Load meta data'))
+        item_names = [d['item_name'] for d in meta_data]
+        assert len(item_names) == len(set(item_names)), 'Key `item_name` should be Unique.'
+        # preprocess data
+        phone_list = []
+        word_list = []
+        spk_names = set()
+        process_item = partial(self.preprocess_first_pass,
+                               txt_processor=self.txt_processor,
+                               wav_processed_dir=wav_processed_dir,
+                               wav_processed_tmp=wav_processed_tmp_dir,
+                               preprocess_args=self.preprocess_args)
+        items = []
+        args = [{
+            'item_name': item_raw['item_name'],
+            'txt_raw': item_raw['txt'],
+            'wav_fn': item_raw['wav_fn'],
+            'txt_loader': item_raw.get('txt_loader'),
+            'others': item_raw.get('others', None)
+        } for item_raw in meta_data]
+        # NOTE(review): zipping with meta_data assumes multiprocess_run_tqdm yields
+        # results in input order -- confirm.
+        for item_, (item_id, item) in zip(meta_data, multiprocess_run_tqdm(process_item, args, desc='Preprocess')):
+            if item is not None:
+                # Merge the first-pass result into the raw metadata dict.
+                item_.update(item)
+                item = item_
+                if 'txt_loader' in item:
+                    del item['txt_loader']
+                item['id'] = item_id
+                item['spk_name'] = item.get('spk_name', '<SINGLE_SPK>')
+                item['others'] = item.get('others', None)
+                phone_list += item['ph'].split(" ")
+                word_list += item['word'].split(" ")
+                spk_names.add(item['spk_name'])
+                items.append(item)
+        # add encoded tokens
+        ph_encoder, word_encoder = self._phone_encoder(phone_list), self._word_encoder(word_list)
+        spk_map = self.build_spk_map(spk_names)
+        args = [{
+            'ph': item['ph'], 'word': item['word'], 'spk_name': item['spk_name'],
+            'word_encoder': word_encoder, 'ph_encoder': ph_encoder, 'spk_map': spk_map
+        } for item in items]
+        for idx, item_new_kv in multiprocess_run_tqdm(self.preprocess_second_pass, args, desc='Add encoded tokens'):
+            items[idx].update(item_new_kv)
+        # build mfa data
+        if self.preprocess_args['use_mfa']:
+            mfa_dict = set()
+            mfa_input_dir = f'{processed_dir}/mfa_inputs'
+            remove_file(mfa_input_dir)
+            # group MFA inputs for better parallelism
+            mfa_groups = [i // self.preprocess_args['nsample_per_mfa_group'] for i in range(len(items))]
+            if self.preprocess_args['mfa_group_shuffle']:
+                random.seed(hparams['seed'])
+                random.shuffle(mfa_groups)
+            args = [{
+                'item': item, 'mfa_input_dir': mfa_input_dir,
+                'mfa_group': mfa_group, 'wav_processed_tmp': wav_processed_tmp_dir,
+                'preprocess_args': self.preprocess_args
+            } for item, mfa_group in zip(items, mfa_groups)]
+            for i, (ph_gb_word_nosil, new_wav_align_fn) in multiprocess_run_tqdm(
+                    self.build_mfa_inputs, args, desc='Build MFA data'):
+                items[i]['wav_align_fn'] = new_wav_align_fn
+                # One dictionary line per word: "<P1_P2_...> <P1 P2 ...>".
+                for w in ph_gb_word_nosil.split(" "):
+                    mfa_dict.add(f"{w} {w.replace('_', ' ')}")
+            mfa_dict = sorted(mfa_dict)
+            with open(f'{processed_dir}/mfa_dict.txt', 'w') as f:
+                f.writelines([f'{l}\n' for l in mfa_dict])
+        with open(f"{processed_dir}/{self.meta_csv_filename}.json", 'w') as f:
+            # The regex removes a newline plus indentation that precedes a digit,
+            # '+' or ']', which compacts the indented JSON output.
+            f.write(re.sub(r'\n\s+([\d+\]])', r'\1', json.dumps(items, ensure_ascii=False, sort_keys=False, indent=1)))
+        remove_file(wav_processed_tmp_dir)
+    @classmethod
+    def preprocess_first_pass(cls, item_name, txt_raw, txt_processor,
+                              wav_fn, wav_processed_dir, wav_processed_tmp,
+                              preprocess_args, txt_loader=None, others=None):
+        try:
+            if txt_loader is not None:
+                txt_raw = txt_loader(txt_raw)
+            ph, txt, word, ph2word, ph_gb_word = cls.txt_to_ph(txt_processor, txt_raw, preprocess_args)
+            wav_fn, wav_align_fn = cls.process_wav(
+                item_name, wav_fn,
+                hparams['processed_data_dir'],
+                wav_processed_tmp, preprocess_args)
+            # wav for binarization
+            ext = os.path.splitext(wav_fn)[1]
+            os.makedirs(wav_processed_dir, exist_ok=True)
+            new_wav_fn = f"{wav_processed_dir}/{item_name}{ext}"
+            move_link_func = move_file if os.path.dirname(wav_fn) == wav_processed_tmp else link_file
+            move_link_func(wav_fn, new_wav_fn)
+            return {
+                'txt': txt, 'txt_raw': txt_raw, 'ph': ph,
+                'word': word, 'ph2word': ph2word, 'ph_gb_word': ph_gb_word,
+                'wav_fn': new_wav_fn, 'wav_align_fn': wav_align_fn,
+                'others': others
+            }
+        except:
+            traceback.print_exc()
+            print(f"| Error is caught. item_name: {item_name}.")
+            return None
+    @staticmethod
+    def txt_to_ph(txt_processor, txt_raw, preprocess_args):
+        txt_struct, txt = txt_processor.process(txt_raw, preprocess_args)
+        ph = [p for w in txt_struct for p in w[1]]
+        ph_gb_word = ["_".join(w[1]) for w in txt_struct]
+        words = [w[0] for w in txt_struct]
+        # word_id=0 is reserved for padding
+        ph2word = [w_id + 1 for w_id, w in enumerate(txt_struct) for _ in range(len(w[1]))]
+        return " ".join(ph), txt, " ".join(words), ph2word, " ".join(ph_gb_word)
+    @staticmethod
+    def process_wav(item_name, wav_fn, processed_dir, wav_processed_tmp, preprocess_args):
+        processors = [get_wav_processor_cls(v) for v in preprocess_args['wav_processors']]
+        processors = [k() for k in processors if k is not None]
+        if len(processors) >= 1:
+            sr_file = librosa.core.get_samplerate(wav_fn)
+            output_fn_for_align = None
+            ext = os.path.splitext(wav_fn)[1]
+            input_fn = f"{wav_processed_tmp}/{item_name}{ext}"
+            link_file(wav_fn, input_fn)
+            for p in processors:
+                outputs = p.process(input_fn, sr_file, wav_processed_tmp, processed_dir, item_name, preprocess_args)
+                if len(outputs) == 3:
+                    input_fn, sr, output_fn_for_align = outputs
+                else:
+                    input_fn, sr = outputs
+            if output_fn_for_align is None:
+                return input_fn, input_fn
+            else:
+                return input_fn, output_fn_for_align
+        else:
+            return wav_fn, wav_fn
+    def _phone_encoder(self, ph_set):
+        ph_set_fn = f"{self.processed_dir}/phone_set.json"
+        if self.preprocess_args['reset_phone_dict'] or not os.path.exists(ph_set_fn):
+            ph_set = sorted(set(ph_set))
+            json.dump(ph_set, open(ph_set_fn, 'w'), ensure_ascii=False)
+            print("| Build phone set: ", ph_set)
+        else:
+            ph_set = json.load(open(ph_set_fn, 'r'))
+            print("| Load phone set: ", ph_set)
+        return build_token_encoder(ph_set_fn)
+    def _word_encoder(self, word_set):
+        word_set_fn = f"{self.processed_dir}/word_set.json"
+        if self.preprocess_args['reset_word_dict']:
+            word_set = Counter(word_set)
+            total_words = sum(word_set.values())
+            word_set = word_set.most_common(hparams['word_dict_size'])
+            num_unk_words = total_words - sum([x[1] for x in word_set])
+            word_set = ['<BOS>', '<EOS>'] + [x[0] for x in word_set]
+            word_set = sorted(set(word_set))
+            json.dump(word_set, open(word_set_fn, 'w'), ensure_ascii=False)
+            print(f"| Build word set. Size: {len(word_set)}, #total words: {total_words},"
+                  f" #unk_words: {num_unk_words}, word_set[:10]:, {word_set[:10]}.")
+        else:
+            word_set = json.load(open(word_set_fn, 'r'))
+            print("| Load word set. Size: ", len(word_set), word_set[:10])
+        return build_token_encoder(word_set_fn)
+    @classmethod
+    def preprocess_second_pass(cls, word, ph, spk_name, word_encoder, ph_encoder, spk_map):
+        word_token = word_encoder.encode(word)
+        ph_token = ph_encoder.encode(ph)
+        spk_id = spk_map[spk_name]
+        return {'word_token': word_token, 'ph_token': ph_token, 'spk_id': spk_id}
+    def build_spk_map(self, spk_names):
+        spk_map = {x: i for i, x in enumerate(sorted(list(spk_names)))}
+        assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
+        print(f"| Number of spks: {len(spk_map)}, spk_map: {spk_map}")
+        json.dump(spk_map, open(self.spk_map_fn, 'w'), ensure_ascii=False)
+        return spk_map
+    @classmethod
+    def build_mfa_inputs(cls, item, mfa_input_dir, mfa_group, wav_processed_tmp, preprocess_args):
+        item_name = item['item_name']
+        wav_align_fn = item['wav_align_fn']
+        ph_gb_word = item['ph_gb_word']
+        ext = os.path.splitext(wav_align_fn)[1]
+        mfa_input_group_dir = f'{mfa_input_dir}/{mfa_group}'
+        os.makedirs(mfa_input_group_dir, exist_ok=True)
+        new_wav_align_fn = f"{mfa_input_group_dir}/{item_name}{ext}"
+        move_link_func = move_file if os.path.dirname(wav_align_fn) == wav_processed_tmp else link_file
+        move_link_func(wav_align_fn, new_wav_align_fn)
+        ph_gb_word_nosil = " ".join(["_".join([p for p in w.split("_") if not is_sil_phoneme(p)])
+                                     for w in ph_gb_word.split(" ") if not is_sil_phoneme(w)])
+        with open(f'{mfa_input_group_dir}/{item_name}.lab', 'w') as f_txt:
+            f_txt.write(ph_gb_word_nosil)
+        return ph_gb_word_nosil, new_wav_align_fn
+    def load_spk_map(self, base_dir):
+        spk_map_fn = f"{base_dir}/spk_map.json"
+        spk_map = json.load(open(spk_map_fn, 'r'))
+        return spk_map
+    def load_dict(self, base_dir):
+        ph_encoder = build_token_encoder(f'{base_dir}/phone_set.json')
+        word_encoder = build_token_encoder(f'{base_dir}/word_set.json')
+        return ph_encoder, word_encoder
+    @property
+    def meta_csv_filename(self):
+        return 'metadata'
+    @property
+    def wav_processed_dirname(self):
+        return 'wav_processed'

NeuralSeq/data_gen/tts/binarizer_zh.py ADDED Viewed

	@@ -0,0 +1,59 @@

+import os
+os.environ["OMP_NUM_THREADS"] = "1"
+from data_gen.tts.txt_processors.zh_g2pM import ALL_SHENMU
+from data_gen.tts.base_binarizer import BaseBinarizer, BinarizationError
+from data_gen.tts.data_gen_utils import get_mel2ph
+from utils.hparams import set_hparams, hparams
+import numpy as np
+# Binarizer for Chinese data: overrides get_align to redistribute the durations
+# returned by get_mel2ph before building mel2ph.
+class ZhBinarizer(BaseBinarizer):
+    @staticmethod
+    def get_align(tg_fn, ph, mel, phone_encoded, res):
+        # Compute per-phoneme durations from the TextGrid, adjust them in place
+        # (two passes below) and store the resulting 'mel2ph' and 'dur' in res.
+        # Raises BinarizationError if the TextGrid is missing or the alignment
+        # does not fit phone_encoded.
+        # NOTE(review): reads res['f0'], so pitch must already be in res -- confirm caller.
+        if tg_fn is not None and os.path.exists(tg_fn):
+            _, dur = get_mel2ph(tg_fn, ph, mel, hparams)
+        else:
+            raise BinarizationError(f"Align not found")
+        ph_list = ph.split(" ")
+        assert len(dur) == len(ph_list)
+        mel2ph = []
+        # Assign the duration of separators to the finals (yunmu)
+        # dur_cumsum[i] is the first frame of phoneme i.
+        dur_cumsum = np.pad(np.cumsum(dur), [1, 0], mode='constant', constant_values=0)
+        for i in range(len(dur)):
+            p = ph_list[i]
+            # Tokens starting with neither '<' nor a letter are treated as separators.
+            if p[0] != '<' and not p[0].isalpha():
+                uv_ = res['f0'][dur_cumsum[i]:dur_cumsum[i + 1]] == 0
+                # Count the leading frames whose f0 is non-zero ...
+                j = 0
+                while j < len(uv_) and not uv_[j]:
+                    j += 1
+                # ... and hand them over to the previous phoneme.
+                dur[i - 1] += j
+                dur[i] -= j
+                # What remains is merged into the previous phoneme when below 100 frames.
+                if dur[i] < 100:
+                    dur[i - 1] += dur[i]
+                    dur[i] = 0
+        # Make initials (shengmu) and finals (yunmu) equally long
+        for i in range(len(dur)):
+            p = ph_list[i]
+            if p in ALL_SHENMU:
+                p_next = ph_list[i + 1]
+                # Only split when the initial has frames and is followed by a final.
+                if not (dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU):
+                    print(f"assert dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU, "
+                          f"dur[i]: {dur[i]}, p: {p}, p_next: {p_next}.")
+                    continue
+                total = dur[i + 1] + dur[i]
+                dur[i] = total // 2
+                dur[i + 1] = total - dur[i]
+        # Expand durations into a 1-based phoneme index per frame.
+        for i in range(len(dur)):
+            mel2ph += [i + 1] * dur[i]
+        mel2ph = np.array(mel2ph)
+        if mel2ph.max() - 1 >= len(phone_encoded):
+            raise BinarizationError(f"| Align does not match: {(mel2ph.max() - 1, len(phone_encoded))}")
+        res['mel2ph'] = mel2ph
+        res['dur'] = dur
+if __name__ == "__main__":
+    set_hparams()
+    ZhBinarizer().process()

NeuralSeq/data_gen/tts/data_gen_utils.py ADDED Viewed

	@@ -0,0 +1,357 @@

+import warnings
+warnings.filterwarnings("ignore")
+import parselmouth
+import os
+import torch
+from skimage.transform import resize
+from utils.text_encoder import TokenTextEncoder
+from utils.pitch_utils import f0_to_coarse
+import struct
+import webrtcvad
+from scipy.ndimage.morphology import binary_dilation
+import librosa
+import numpy as np
+from utils import audio
+import pyloudnorm as pyln
+import re
+import json
+from collections import OrderedDict
+# Punctuation characters -- NOTE(review): usage is outside this view.
+PUNCS = '!,.?;:'
+# Largest value of a signed 16-bit integer (32767).
+int16_max = (2 ** 15) - 1
+def trim_long_silences(path, sr=None, return_raw_wav=False, norm=True, vad_max_silence_length=12):
+    """
+    Ensures that segments without voice in the waveform remain no longer than a
+    threshold determined by the VAD parameters in params.py.
+    :param wav: the raw waveform as a numpy array of floats
+    :param vad_max_silence_length: Maximum number of consecutive silent frames a segment can have.
+    :return: the same waveform with silences trimmed away (length <= original wav length)
+    """
+    ## Voice Activation Detection
+    # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
+    # This sets the granularity of the VAD. Should not need to be changed.
+    sampling_rate = 16000
+    wav_raw, sr = librosa.core.load(path, sr=sr)
+    if norm:
+        meter = pyln.Meter(sr)  # create BS.1770 meter
+        loudness = meter.integrated_loudness(wav_raw)
+        wav_raw = pyln.normalize.loudness(wav_raw, loudness, -20.0)
+        if np.abs(wav_raw).max() > 1.0:
+            wav_raw = wav_raw / np.abs(wav_raw).max()
+    wav = librosa.resample(wav_raw, sr, sampling_rate, res_type='kaiser_best')
+    vad_window_length = 30  # In milliseconds
+    # Number of frames to average together when performing the moving average smoothing.
+    # The larger this value, the larger the VAD variations must be to not get smoothed out.
+    vad_moving_average_width = 8
+    # Compute the voice detection window size
+    samples_per_window = (vad_window_length * sampling_rate) // 1000
+    # Trim the end of the audio to have a multiple of the window size
+    wav = wav[:len(wav) - (len(wav) % samples_per_window)]
+    # Convert the float waveform to 16-bit mono PCM
+    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
+    # Perform voice activation detection
+    voice_flags = []
+    vad = webrtcvad.Vad(mode=3)
+    for window_start in range(0, len(wav), samples_per_window):
+        window_end = window_start + samples_per_window
+        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
+                                         sample_rate=sampling_rate))
+    voice_flags = np.array(voice_flags)
+    # Smooth the voice detection with a moving average
+    def moving_average(array, width):
+        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
+        ret = np.cumsum(array_padded, dtype=float)
+        ret[width:] = ret[width:] - ret[:-width]
+        return ret[width - 1:] / width
+    audio_mask = moving_average(voice_flags, vad_moving_average_width)
+    audio_mask = np.round(audio_mask).astype(np.bool)
+    # Dilate the voiced regions
+    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
+    audio_mask = np.repeat(audio_mask, samples_per_window)
+    audio_mask = resize(audio_mask, (len(wav_raw),)) > 0
+    if return_raw_wav:
+        return wav_raw, audio_mask, sr
+    return wav_raw[audio_mask], audio_mask, sr
+def process_utterance(wav_path,
+                      fft_size=1024,
+                      hop_size=256,
+                      win_length=1024,
+                      window="hann",
+                      num_mels=80,
+                      fmin=80,
+                      fmax=7600,
+                      eps=1e-6,
+                      sample_rate=22050,
+                      loud_norm=False,
+                      min_level_db=-100,
+                      return_linear=False,
+                      trim_long_sil=False, vocoder='pwg'):
+    if isinstance(wav_path, str):
+        if trim_long_sil:
+            wav, _, _ = trim_long_silences(wav_path, sample_rate)
+        else:
+            wav, _ = librosa.core.load(wav_path, sr=sample_rate)
+    else:
+        wav = wav_path
+    if loud_norm:
+        meter = pyln.Meter(sample_rate)  # create BS.1770 meter
+        loudness = meter.integrated_loudness(wav)
+        wav = pyln.normalize.loudness(wav, loudness, -22.0)
+        if np.abs(wav).max() > 1:
+            wav = wav / np.abs(wav).max()
+    # get amplitude spectrogram
+    x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
+                          win_length=win_length, window=window, pad_mode="constant")
+    spc = np.abs(x_stft)  # (n_bins, T)
+    # get mel basis
+    fmin = 0 if fmin == -1 else fmin
+    fmax = sample_rate / 2 if fmax == -1 else fmax
+    mel_basis = librosa.filters.mel(sample_rate, fft_size, num_mels, fmin, fmax)
+    mel = mel_basis @ spc
+    if vocoder == 'pwg':
+        mel = np.log10(np.maximum(eps, mel))  # (n_mel_bins, T)
+    else:
+        assert False, f'"{vocoder}" is not in ["pwg"].'
+    l_pad, r_pad = audio.librosa_pad_lr(wav, fft_size, hop_size, 1)
+    wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
+    wav = wav[:mel.shape[1] * hop_size]
+    if not return_linear:
+        return wav, mel
+    else:
+        spc = audio.amp_to_db(spc)
+        spc = audio.normalize(spc, {'min_level_db': min_level_db})
+        return wav, mel, spc
+def get_pitch(wav_data, mel, hparams):
+    """
+    :param wav_data: [T]
+    :param mel: [T, 80]
+    :param hparams:
+    :return:
+    """
+    time_step = hparams['hop_size'] / hparams['audio_sample_rate'] * 1000
+    f0_min = 80
+    f0_max = 750
+    if hparams['hop_size'] == 128:
+        pad_size = 4
+    elif hparams['hop_size'] == 256:
+        pad_size = 2
+    else:
+        assert False
+    f0 = parselmouth.Sound(wav_data, hparams['audio_sample_rate']).to_pitch_ac(
+        time_step=time_step / 1000, voicing_threshold=0.6,
+        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
+    lpad = pad_size * 2
+    rpad = len(mel) - len(f0) - lpad
+    f0 = np.pad(f0, [[lpad, rpad]], mode='constant')
+    # mel and f0 are extracted by 2 different libraries. we should force them to have the same length.
+    # Attention: we find that new version of some libraries could cause ``rpad'' to be a negetive value...
+    # Just to be sure, we recommend users to set up the same environments as them in requirements_auto.txt (by Anaconda)
+    delta_l = len(mel) - len(f0)
+    assert np.abs(delta_l) <= 8
+    if delta_l > 0:
+        f0 = np.concatenate([f0, [f0[-1]] * delta_l], 0)
+    f0 = f0[:len(mel)]
+    pitch_coarse = f0_to_coarse(f0)
+    return f0, pitch_coarse
+def remove_empty_lines(text):
+    """remove empty lines"""
+    assert (len(text) > 0)
+    assert (isinstance(text, list))
+    text = [t.strip() for t in text]
+    if "" in text:
+        text.remove("")
+    return text
+class TextGrid(object):
+    def __init__(self, text):
+        text = remove_empty_lines(text)
+        self.text = text
+        self.line_count = 0
+        self._get_type()
+        self._get_time_intval()
+        self._get_size()
+        self.tier_list = []
+        self._get_item_list()
+    def _extract_pattern(self, pattern, inc):
+        """
+        Parameters
+        ----------
+        pattern : regex to extract pattern
+        inc : increment of line count after extraction
+        Returns
+        -------
+        group : extracted info
+        """
+        try:
+            group = re.match(pattern, self.text[self.line_count]).group(1)
+            self.line_count += inc
+        except AttributeError:
+            raise ValueError("File format error at line %d:%s" % (self.line_count, self.text[self.line_count]))
+        return group
+    def _get_type(self):
+        self.file_type = self._extract_pattern(r"File type = \"(.*)\"", 2)
+    def _get_time_intval(self):
+        self.xmin = self._extract_pattern(r"xmin = (.*)", 1)
+        self.xmax = self._extract_pattern(r"xmax = (.*)", 2)
+    def _get_size(self):
+        self.size = int(self._extract_pattern(r"size = (.*)", 2))
+    def _get_item_list(self):
+        """Only supports IntervalTier currently"""
+        for itemIdx in range(1, self.size + 1):
+            tier = OrderedDict()
+            item_list = []
+            tier_idx = self._extract_pattern(r"item \[(.*)\]:", 1)
+            tier_class = self._extract_pattern(r"class = \"(.*)\"", 1)
+            if tier_class != "IntervalTier":
+                raise NotImplementedError("Only IntervalTier class is supported currently")
+            tier_name = self._extract_pattern(r"name = \"(.*)\"", 1)
+            tier_xmin = self._extract_pattern(r"xmin = (.*)", 1)
+            tier_xmax = self._extract_pattern(r"xmax = (.*)", 1)
+            tier_size = self._extract_pattern(r"intervals: size = (.*)", 1)
+            for i in range(int(tier_size)):
+                item = OrderedDict()
+                item["idx"] = self._extract_pattern(r"intervals \[(.*)\]", 1)
+                item["xmin"] = self._extract_pattern(r"xmin = (.*)", 1)
+                item["xmax"] = self._extract_pattern(r"xmax = (.*)", 1)
+                item["text"] = self._extract_pattern(r"text = \"(.*)\"", 1)
+                item_list.append(item)
+            tier["idx"] = tier_idx
+            tier["class"] = tier_class
+            tier["name"] = tier_name
+            tier["xmin"] = tier_xmin
+            tier["xmax"] = tier_xmax
+            tier["size"] = tier_size
+            tier["items"] = item_list
+            self.tier_list.append(tier)
+    def toJson(self):
+        _json = OrderedDict()
+        _json["file_type"] = self.file_type
+        _json["xmin"] = self.xmin
+        _json["xmax"] = self.xmax
+        _json["size"] = self.size
+        _json["tiers"] = self.tier_list
+        return json.dumps(_json, ensure_ascii=False, indent=2)
+def get_mel2ph(tg_fn, ph, mel, hparams):
+    ph_list = ph.split(" ")
+    with open(tg_fn, "r") as f:
+        tg = f.readlines()
+    tg = remove_empty_lines(tg)
+    tg = TextGrid(tg)
+    tg = json.loads(tg.toJson())
+    split = np.ones(len(ph_list) + 1, np.float) * -1
+    tg_idx = 0
+    ph_idx = 0
+    tg_align = [x for x in tg['tiers'][-1]['items']]
+    tg_align_ = []
+    for x in tg_align:
+        x['xmin'] = float(x['xmin'])
+        x['xmax'] = float(x['xmax'])
+        if x['text'] in ['sil', 'sp', '', 'SIL', 'PUNC']:
+            x['text'] = ''
+            if len(tg_align_) > 0 and tg_align_[-1]['text'] == '':
+                tg_align_[-1]['xmax'] = x['xmax']
+                continue
+        tg_align_.append(x)
+    tg_align = tg_align_
+    tg_len = len([x for x in tg_align if x['text'] != ''])
+    ph_len = len([x for x in ph_list if not is_sil_phoneme(x)])
+    assert tg_len == ph_len, (tg_len, ph_len, tg_align, ph_list, tg_fn)
+    while tg_idx < len(tg_align) or ph_idx < len(ph_list):
+        if tg_idx == len(tg_align) and is_sil_phoneme(ph_list[ph_idx]):
+            split[ph_idx] = 1e8
+            ph_idx += 1
+            continue
+        x = tg_align[tg_idx]
+        if x['text'] == '' and ph_idx == len(ph_list):
+            tg_idx += 1
+            continue
+        assert ph_idx < len(ph_list), (tg_len, ph_len, tg_align, ph_list, tg_fn)
+        ph = ph_list[ph_idx]
+        if x['text'] == '' and not is_sil_phoneme(ph):
+            assert False, (ph_list, tg_align)
+        if x['text'] != '' and is_sil_phoneme(ph):
+            ph_idx += 1
+        else:
+            assert (x['text'] == '' and is_sil_phoneme(ph)) \
+                   or x['text'].lower() == ph.lower() \
+                   or x['text'].lower() == 'sil', (x['text'], ph)
+            split[ph_idx] = x['xmin']
+            if ph_idx > 0 and split[ph_idx - 1] == -1 and is_sil_phoneme(ph_list[ph_idx - 1]):
+                split[ph_idx - 1] = split[ph_idx]
+            ph_idx += 1
+            tg_idx += 1
+    assert tg_idx == len(tg_align), (tg_idx, [x['text'] for x in tg_align])
+    assert ph_idx >= len(ph_list) - 1, (ph_idx, ph_list, len(ph_list), [x['text'] for x in tg_align], tg_fn)
+    mel2ph = np.zeros([mel.shape[0]], np.int)
+    split[0] = 0
+    split[-1] = 1e8
+    for i in range(len(split) - 1):
+        assert split[i] != -1 and split[i] <= split[i + 1], (split[:-1],)
+    split = [int(s * hparams['audio_sample_rate'] / hparams['hop_size'] + 0.5) for s in split]
+    for ph_idx in range(len(ph_list)):
+        mel2ph[split[ph_idx]:split[ph_idx + 1]] = ph_idx + 1
+    mel2ph_torch = torch.from_numpy(mel2ph)
+    T_t = len(ph_list)
+    dur = mel2ph_torch.new_zeros([T_t + 1]).scatter_add(0, mel2ph_torch, torch.ones_like(mel2ph_torch))
+    dur = dur[1:].numpy()
+    return mel2ph, dur
+def build_phone_encoder(data_dir):
+    phone_list_file = os.path.join(data_dir, 'phone_set.json')
+    phone_list = json.load(open(phone_list_file))
+    return TokenTextEncoder(None, vocab_list=phone_list, replace_oov=',')
+def build_word_encoder(data_dir):
+    word_list_file = os.path.join(data_dir, 'word_set.json')
+    word_list = json.load(open(word_list_file))
+    return TokenTextEncoder(None, vocab_list=word_list, replace_oov=',')
+def is_sil_phoneme(p):
+    return not p[0].isalpha()
+def build_token_encoder(token_list_file):
+    token_list = json.load(open(token_list_file))
+    return TokenTextEncoder(None, vocab_list=token_list, replace_oov='<UNK>')

NeuralSeq/data_gen/tts/emotion/__pycache__/audio.cpython-38.pyc ADDED Viewed

Binary file (3.8 kB). View file

NeuralSeq/data_gen/tts/emotion/__pycache__/inference.cpython-38.pyc ADDED Viewed

Binary file (7.28 kB). View file

NeuralSeq/data_gen/tts/emotion/__pycache__/model.cpython-38.pyc ADDED Viewed

Binary file (2.53 kB). View file

NeuralSeq/data_gen/tts/emotion/__pycache__/params_data.cpython-38.pyc ADDED Viewed

Binary file (491 Bytes). View file

NeuralSeq/data_gen/tts/emotion/__pycache__/params_model.cpython-38.pyc ADDED Viewed

Binary file (371 Bytes). View file

NeuralSeq/data_gen/tts/emotion/audio.py ADDED Viewed

	@@ -0,0 +1,107 @@

+from scipy.ndimage.morphology import binary_dilation
+from data_gen.tts.emotion.params_data import *
+from pathlib import Path
+from typing import Optional, Union
+import numpy as np
+import webrtcvad
+import librosa
+import struct
+# Largest positive signed 16-bit value; used to scale float waveforms to 16-bit PCM.
+int16_max = (2 ** 15) - 1
+def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
+                   source_sr: Optional[int] = None):
+    """
+    Applies the preprocessing operations used in training the Speaker Encoder to a waveform
+    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
+    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
+    just .wav), either the waveform as a numpy array of floats.
+    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
+    preprocessing. After preprocessing, the waveform's sampling rate will match the data
+    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
+    this argument will be ignored.
+    """
+    # Load the wav from disk if needed
+    if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
+        wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
+    else:
+        wav = fpath_or_wav
+    # Resample the wav if needed
+    if source_sr is not None and source_sr != sampling_rate:
+        wav = librosa.resample(wav, source_sr, sampling_rate)
+    # Apply the preprocessing: normalize volume and shorten long silences
+    wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
+    wav = trim_long_silences(wav)
+    return wav
+def wav_to_mel_spectrogram(wav):
+    """
+    Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
+    Note: this not a log-mel spectrogram.
+    """
+    frames = librosa.feature.melspectrogram(
+        wav,
+        sampling_rate,
+        n_fft=int(sampling_rate * mel_window_length / 1000),
+        hop_length=int(sampling_rate * mel_window_step / 1000),
+        n_mels=mel_n_channels
+    )
+    return frames.astype(np.float32).T
+def trim_long_silences(wav):
+    """
+    Ensures that segments without voice in the waveform remain no longer than a
+    threshold determined by the VAD parameters in params.py.
+    :param wav: the raw waveform as a numpy array of floats
+    :return: the same waveform with silences trimmed away (length <= original wav length)
+    """
+    # Compute the voice detection window size
+    samples_per_window = (vad_window_length * sampling_rate) // 1000
+    # Trim the end of the audio to have a multiple of the window size
+    wav = wav[:len(wav) - (len(wav) % samples_per_window)]
+    # Convert the float waveform to 16-bit mono PCM
+    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
+    # Perform voice activation detection
+    voice_flags = []
+    vad = webrtcvad.Vad(mode=3)
+    for window_start in range(0, len(wav), samples_per_window):
+        window_end = window_start + samples_per_window
+        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
+                                         sample_rate=sampling_rate))
+    voice_flags = np.array(voice_flags)
+    # Smooth the voice detection with a moving average
+    def moving_average(array, width):
+        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
+        ret = np.cumsum(array_padded, dtype=float)
+        ret[width:] = ret[width:] - ret[:-width]
+        return ret[width - 1:] / width
+    audio_mask = moving_average(voice_flags, vad_moving_average_width)
+    audio_mask = np.round(audio_mask).astype(np.bool)
+    # Dilate the voiced regions
+    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
+    audio_mask = np.repeat(audio_mask, samples_per_window)
+    return wav[audio_mask == True]
+def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
+    if increase_only and decrease_only:
+        raise ValueError("Both increase only and decrease only are set")
+    dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
+    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
+        return wav
+    return wav * (10 ** (dBFS_change / 20))

NeuralSeq/data_gen/tts/emotion/inference.py ADDED Viewed

	@@ -0,0 +1,177 @@

+from data_gen.tts.emotion.params_data import *
+from data_gen.tts.emotion.model import EmotionEncoder
+from data_gen.tts.emotion.audio import preprocess_wav   # We want to expose this function from here
+from matplotlib import cm
+from data_gen.tts.emotion import audio
+from pathlib import Path
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+_model = None # type: EmotionEncoder
+_device = None # type: torch.device
+def load_model(weights_fpath: Path, device=None):
+    """
+    Loads the model in memory. If this function is not explicitely called, it will be run on the
+    first call to embed_frames() with the default weights file.
+    :param weights_fpath: the path to saved model weights.
+    :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
+    model will be loaded and will run on this device. Outputs will however always be on the cpu.
+    If None, will default to your GPU if it"s available, otherwise your CPU.
+    """
+    # TODO: I think the slow loading of the encoder might have something to do with the device it
+    #   was saved on. Worth investigating.
+    global _model, _device
+    if device is None:
+        _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    elif isinstance(device, str):
+        _device = torch.device(device)
+    _model = EmotionEncoder(_device, torch.device("cpu"))
+    checkpoint = torch.load(weights_fpath)
+    _model.load_state_dict(checkpoint["model_state"])
+    _model.eval()
+    print("Loaded encoder trained to step %d" % (checkpoint["step"]))
+def is_loaded():
+    return _model is not None
+def embed_frames_batch(frames_batch):
+    """
+    Computes embeddings for a batch of mel spectrogram.
+    :param frames_batch: a batch mel of spectrogram as a numpy array of float32 of shape
+    (batch_size, n_frames, n_channels)
+    :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size)
+    """
+    if _model is None:
+        raise Exception("Model was not loaded. Call load_model() before inference.")
+    frames = torch.from_numpy(frames_batch).to(_device)
+    embed = _model.inference(frames).detach().cpu().numpy()
+    return embed
+def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
+                           min_pad_coverage=0.75, overlap=0.5):
+    """
+    Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
+    partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
+    spectrogram slices are returned, so as to make each partial utterance waveform correspond to
+    its spectrogram. This function assumes that the mel spectrogram parameters used are those
+    defined in params_data.py.
+    The returned ranges may be indexing further than the length of the waveform. It is
+    recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
+    :param n_samples: the number of samples in the waveform
+    :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
+    utterance
+    :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
+    enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
+    then the last partial utterance will be considered, as if we padded the audio. Otherwise,
+    it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
+    utterance, this parameter is ignored so that the function always returns at least 1 slice.
+    :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
+    utterances are entirely disjoint.
+    :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
+    respectively the waveform and the mel spectrogram with these slices to obtain the partial
+    utterances.
+    """
+    assert 0 <= overlap < 1
+    assert 0 < min_pad_coverage <= 1
+    samples_per_frame = int((sampling_rate * mel_window_step / 1000))
+    n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
+    frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
+    # Compute the slices
+    wav_slices, mel_slices = [], []
+    steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
+    for i in range(0, steps, frame_step):
+        mel_range = np.array([i, i + partial_utterance_n_frames])
+        wav_range = mel_range * samples_per_frame
+        mel_slices.append(slice(*mel_range))
+        wav_slices.append(slice(*wav_range))
+    # Evaluate whether extra padding is warranted or not
+    last_wav_range = wav_slices[-1]
+    coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
+    if coverage < min_pad_coverage and len(mel_slices) > 1:
+        mel_slices = mel_slices[:-1]
+        wav_slices = wav_slices[:-1]
+    return wav_slices, mel_slices
+def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
+    """
+    Computes an embedding for a single utterance.
+    # TODO: handle multiple wavs to benefit from batching on GPU
+    :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
+    :param using_partials: if True, then the utterance is split in partial utterances of
+    <partial_utterance_n_frames> frames and the utterance embedding is computed from their
+    normalized average. If False, the utterance is instead computed from feeding the entire
+    spectogram to the network.
+    :param return_partials: if True, the partial embeddings will also be returned along with the
+    wav slices that correspond to the partial embeddings.
+    :param kwargs: additional arguments to compute_partial_splits()
+    :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
+    <return_partials> is True, the partial utterances as a numpy array of float32 of shape
+    (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
+    returned. If <using_partials> is simultaneously set to False, both these values will be None
+    instead.
+    """
+    # Process the entire utterance if not using partials
+    if not using_partials:
+        frames = audio.wav_to_mel_spectrogram(wav)
+        embed = embed_frames_batch(frames[None, ...])[0]
+        if return_partials:
+            return embed, None, None
+        return embed
+    # Compute where to split the utterance into partials and pad if necessary
+    wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
+    max_wave_length = wave_slices[-1].stop
+    if max_wave_length >= len(wav):
+        wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
+    # Split the utterance into partials
+    frames = audio.wav_to_mel_spectrogram(wav)
+    frames_batch = np.array([frames[s] for s in mel_slices])
+    partial_embeds = embed_frames_batch(frames_batch)
+    # Compute the utterance embedding from the partial embeddings
+    raw_embed = np.mean(partial_embeds, axis=0)
+    embed = raw_embed / np.linalg.norm(raw_embed, 2)
+    if return_partials:
+        return embed, partial_embeds, wave_slices
+    return embed
+def embed_speaker(wavs, **kwargs):
+    raise NotImplemented()
+def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
+    if ax is None:
+        ax = plt.gca()
+    if shape is None:
+        height = int(np.sqrt(len(embed)))
+        shape = (height, -1)
+    embed = embed.reshape(shape)
+    cmap = cm.get_cmap()
+    mappable = ax.imshow(embed, cmap=cmap)
+    cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
+    cbar.set_clim(*color_range)
+    ax.set_xticks([]), ax.set_yticks([])
+    ax.set_title(title)

NeuralSeq/data_gen/tts/emotion/model.py ADDED Viewed

	@@ -0,0 +1,78 @@

+from data_gen.tts.emotion.params_model import *
+from data_gen.tts.emotion.params_data import *
+from torch.nn.utils import clip_grad_norm_
+from scipy.optimize import brentq
+from torch import nn
+import numpy as np
+import torch
+class EmotionEncoder(nn.Module):
+    def __init__(self, device, loss_device):
+        super().__init__()
+        self.loss_device = loss_device
+        # Network defition
+        self.lstm = nn.LSTM(input_size=mel_n_channels,
+                            hidden_size=model_hidden_size,
+                            num_layers=model_num_layers,
+                            batch_first=True).to(device)
+        self.linear = nn.Linear(in_features=model_hidden_size,
+                                out_features=model_embedding_size).to(device)
+        self.relu = torch.nn.ReLU().to(device)
+        # Cosine similarity scaling (with fixed initial parameter values)
+        self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
+        self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)
+        # Loss
+        self.loss_fn = nn.CrossEntropyLoss().to(loss_device)
+    def do_gradient_ops(self):
+        # Gradient scale
+        self.similarity_weight.grad *= 0.01
+        self.similarity_bias.grad *= 0.01
+        # Gradient clipping
+        clip_grad_norm_(self.parameters(), 3, norm_type=2)
+    def forward(self, utterances, hidden_init=None):
+        """
+        Computes the embeddings of a batch of utterance spectrograms.
+        :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
+        (batch_size, n_frames, n_channels)
+        :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
+        batch_size, hidden_size). Will default to a tensor of zeros if None.
+        :return: the embeddings as a tensor of shape (batch_size, embedding_size)
+        """
+        # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
+        # and the final cell state.
+        out, (hidden, cell) = self.lstm(utterances, hidden_init)
+        # We take only the hidden state of the last layer
+        embeds_raw = self.relu(self.linear(hidden[-1]))
+        # L2-normalize it
+        embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
+        return embeds
+    def inference(self, utterances, hidden_init=None):
+        """
+        Computes the embeddings of a batch of utterance spectrograms.
+        :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
+        (batch_size, n_frames, n_channels)
+        :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
+        batch_size, hidden_size). Will default to a tensor of zeros if None.
+        :return: the embeddings as a tensor of shape (batch_size, embedding_size)
+        """
+        # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
+        # and the final cell state.
+        out, (hidden, cell) = self.lstm(utterances, hidden_init)
+        return hidden[-1]

NeuralSeq/data_gen/tts/emotion/params_data.py ADDED Viewed

	@@ -0,0 +1,29 @@

+## Mel-filterbank
+mel_window_length = 25  # In milliseconds
+mel_window_step = 10    # In milliseconds
+mel_n_channels = 40     # Number of mel channels
+## Audio
+sampling_rate = 16000   # In Hz; waveforms are resampled to this rate during preprocessing
+# Number of spectrogram frames in a partial utterance
+partials_n_frames = 160     # 1600 ms
+# Number of spectrogram frames at inference
+inference_n_frames = 80     #  800 ms
+## Voice Activation Detection
+# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
+# This sets the granularity of the VAD. Should not need to be changed.
+vad_window_length = 30  # In milliseconds
+# Number of frames to average together when performing the moving average smoothing.
+# The larger this value, the larger the VAD variations must be to not get smoothed out.
+vad_moving_average_width = 8
+# Maximum number of consecutive silent frames a segment can have.
+vad_max_silence_length = 6
+## Audio volume normalization
+# Target level in dBFS handed to normalize_volume() during preprocessing.
+audio_norm_target_dBFS = -30

NeuralSeq/data_gen/tts/emotion/params_model.py ADDED Viewed

	@@ -0,0 +1,11 @@

+## Model parameters
+model_hidden_size = 256     # Hidden size of the encoder LSTM
+model_embedding_size = 256  # Output size of the linear layer after the LSTM
+model_num_layers = 3        # Number of stacked LSTM layers
+## Training parameters
+learning_rate_init = 1e-4
+speakers_per_batch = 6
+utterances_per_speaker = 20

NeuralSeq/data_gen/tts/emotion/test_emotion.py ADDED Viewed

	@@ -0,0 +1,184 @@

+#!/usr/bin/env python3 -u
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Run inference for pre-processed data with a trained model.
+"""
+import logging
+import math
+import numpy, math, pdb, sys, random
+import time, os, itertools, shutil, importlib
+import argparse
+import os
+import sys
+import glob
+from sklearn import metrics
+import soundfile as sf
+#import sentencepiece as spm
+import torch
+import inference as encoder
+import torch.nn as nn
+import torch.nn.functional as F
+from pathlib import Path
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+from resemblyzer import VoiceEncoder, preprocess_wav
+def tuneThresholdfromScore(scores, labels, target_fa, target_fr=None):
+    fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=1)
+    fnr = 1 - tpr
+    fnr = fnr * 100
+    fpr = fpr * 100
+    tunedThreshold = [];
+    if target_fr:
+        for tfr in target_fr:
+            idx = numpy.nanargmin(numpy.absolute((tfr - fnr)))
+            tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]]);
+    for tfa in target_fa:
+        idx = numpy.nanargmin(numpy.absolute((tfa - fpr)))  # numpy.where(fpr<=tfa)[0][-1]
+        tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]]);
+    idxE = numpy.nanargmin(numpy.absolute((fnr - fpr)))
+    eer = max(fpr[idxE], fnr[idxE])
+    return (tunedThreshold, eer, fpr, fnr);
+def loadWAV(filename, max_frames, evalmode=True, num_eval=10):
+    # Maximum audio length
+    max_audio = max_frames * 160 + 240
+    # Read wav file and convert to torch tensor
+    audio,sample_rate = sf.read(filename)
+    feats_v0 = torch.from_numpy(audio).float()
+    audiosize = audio.shape[0]
+    if audiosize <= max_audio:
+        shortage = math.floor((max_audio - audiosize + 1) / 2)
+        audio = numpy.pad(audio, (shortage, shortage), 'constant', constant_values=0)
+        audiosize = audio.shape[0]
+    if evalmode:
+        startframe = numpy.linspace(0, audiosize - max_audio, num=num_eval)
+    else:
+        startframe = numpy.array([numpy.int64(random.random() * (audiosize - max_audio))])
+    feats = []
+    if evalmode and max_frames == 0:
+        feats.append(audio)
+    else:
+        for asf in startframe:
+            feats.append(audio[int(asf):int(asf) + max_audio])
+    feat = numpy.stack(feats, axis=0)
+    feat = torch.FloatTensor(feat)
+    return feat;
+def evaluateFromList(listfilename, print_interval=100, test_path='', multi=False):
+    lines       = []
+    files       = []
+    feats       = {}
+    tstart      = time.time()
+    ## Read all lines
+    with open(listfilename) as listfile:
+        while True:
+            line = listfile.readline();
+            if (not line):
+                break;
+            data = line.split();
+            ## Append random label if missing
+            if len(data) == 2: data = [random.randint(0,1)] + data
+            files.append(data[1])
+            files.append(data[2])
+            lines.append(line)
+    setfiles = list(set(files))
+    setfiles.sort()
+    ## Save all features to file
+    for idx, file in enumerate(setfiles):
+        # preprocessed_wav = encoder.preprocess_wav(os.path.join(test_path,file))
+        # embed = encoder.embed_utterance(preprocessed_wav)
+        processed_wav = preprocess_wav(os.path.join(test_path,file))
+        embed = voice_encoder.embed_utterance(processed_wav)
+        torch.cuda.empty_cache()
+        ref_feat = torch.from_numpy(embed).unsqueeze(0)
+        feats[file]     = ref_feat
+        telapsed = time.time() - tstart
+        if idx % print_interval == 0:
+            sys.stdout.write("\rReading %d of %d: %.2f Hz, embedding size %d"%(idx,len(setfiles),idx/telapsed,ref_feat.size()[1]));
+    print('')
+    all_scores = [];
+    all_labels = [];
+    all_trials = [];
+    tstart = time.time()
+    ## Read files and compute all scores
+    for idx, line in enumerate(lines):
+        data = line.split();
+        ## Append random label if missing
+        if len(data) == 2: data = [random.randint(0,1)] + data
+        ref_feat = feats[data[1]]
+        com_feat = feats[data[2]]
+        ref_feat = ref_feat.cuda()
+        com_feat = com_feat.cuda()
+        # normalize feats
+        ref_feat = F.normalize(ref_feat, p=2, dim=1)
+        com_feat = F.normalize(com_feat, p=2, dim=1)
+        dist = F.pairwise_distance(ref_feat.unsqueeze(-1), com_feat.unsqueeze(-1)).detach().cpu().numpy();
+        score = -1 * numpy.mean(dist);
+        all_scores.append(score);
+        all_labels.append(int(data[0]));
+        all_trials.append(data[1]+" "+data[2])
+        if idx % print_interval == 0:
+            telapsed = time.time() - tstart
+            sys.stdout.write("\rComputing %d of %d: %.2f Hz"%(idx,len(lines),idx/telapsed));
+            sys.stdout.flush();
+    print('\n')
+    return (all_scores, all_labels, all_trials);
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser("baseline")
+    parser.add_argument("--data_root", type=str, help="", required=True)
+    parser.add_argument("--list", type=str, help="", required=True)
+    parser.add_argument("--model_dir", type=str, help="model parameters for AudioEncoder", required=True)
+    args = parser.parse_args()
+    # Load the models one by one.
+    print("Preparing the encoder...")
+    # encoder.load_model(Path(args.model_dir))
+    print("Insert the wav file name...")
+    voice_encoder = VoiceEncoder().cuda()
+    sc, lab, trials = evaluateFromList(args.list, print_interval=100, test_path=args.data_root)
+    result = tuneThresholdfromScore(sc, lab, [1, 0.1]);
+    print('EER %2.4f'%result[1])

NeuralSeq/data_gen/tts/txt_processors/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from . import en

NeuralSeq/data_gen/tts/txt_processors/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (218 Bytes). View file

NeuralSeq/data_gen/tts/txt_processors/__pycache__/base_text_processor.cpython-38.pyc ADDED Viewed

Binary file (1.9 kB). View file

NeuralSeq/data_gen/tts/txt_processors/__pycache__/en.cpython-38.pyc ADDED Viewed

Binary file (2.87 kB). View file

NeuralSeq/data_gen/tts/txt_processors/base_text_processor.py ADDED Viewed

	@@ -0,0 +1,47 @@

+from data_gen.tts.data_gen_utils import is_sil_phoneme
+# Registry of text processor classes, keyed by the name they were registered under.
+REGISTERED_TEXT_PROCESSORS = {}
+def register_txt_processors(name):
+    def _f(cls):
+        REGISTERED_TEXT_PROCESSORS[name] = cls
+        return cls
+    return _f
+def get_txt_processor_cls(name):
+    return REGISTERED_TEXT_PROCESSORS.get(name, None)
+class BaseTxtProcessor:
+    @staticmethod
+    def sp_phonemes():
+        return ['|']
+    @classmethod
+    def process(cls, txt, preprocess_args):
+        raise NotImplementedError
+    @classmethod
+    def postprocess(cls, txt_struct, preprocess_args):
+        # remove sil phoneme in head and tail
+        while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[0][0]):
+            txt_struct = txt_struct[1:]
+        while len(txt_struct) > 0 and is_sil_phoneme(txt_struct[-1][0]):
+            txt_struct = txt_struct[:-1]
+        if preprocess_args['with_phsep']:
+            txt_struct = cls.add_bdr(txt_struct)
+        if preprocess_args['add_eos_bos']:
+            txt_struct = [["<BOS>", ["<BOS>"]]] + txt_struct + [["<EOS>", ["<EOS>"]]]
+        return txt_struct
+    @classmethod
+    def add_bdr(cls, txt_struct):
+        txt_struct_ = []
+        for i, ts in enumerate(txt_struct):
+            txt_struct_.append(ts)
+            if i != len(txt_struct) - 1 and \
+                    not is_sil_phoneme(txt_struct[i][0]) and not is_sil_phoneme(txt_struct[i + 1][0]):
+                txt_struct_.append(['|', ['|']])
+        return txt_struct_