File size: 2,434 Bytes
98f685a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# Parent configs this file builds on.
# NOTE(review): presumably loaded in list order, with keys in this file
# overriding both parents — confirm against the hparams loader before
# reordering the entries.
base_config:
  - egs/egs_bases/tts/fs2.yaml
  - egs/datasets/audio/emotion/base_text2mel.yaml

# Dotted path naming the task class (presumably imported by the launcher —
# verify against caller).
task_cls: modules.GenerSpeech.task.generspeech.GenerSpeechTask

# emotion encoder
# Set this to the location of the emotion encoder checkpoint.
emotion_encoder_path: checkpoints/Emotion_encoder.pt

# vocoder
vocoder_ckpt: checkpoints/trainset_hifigan
vocoder: hifigan

# dataset
# Directories for the raw, processed and binary forms of the training set.
raw_data_dir: data/raw/training_set
processed_data_dir: data/processed/training_set
binary_data_dir: data/binary/training_set
# Left as an empty string in this config.
test_input_dir: ''

# process
binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer

# Framing. Durations are worked out for the 16000 Hz rate set in this file.
audio_sample_rate: 16000
# Extra window size is filled with 0 paddings to match this parameter.
fft_size: 1024
# 1024 samples / 16000 Hz = 64 ms. (If None, win_size: fft_size.)
win_size: 1024
# 256 samples / 16000 Hz = 16 ms.
hop_size: 256

# Set fmin to 55 if your speaker is male; if female, 95 should help taking off
# noise. To test depending on dataset.
# Pitch info: male~[65, 260], female~[100, 525].
fmin: 80
# To be increased/reduced depending on data.
fmax: 7600

min_level_db: -100
ref_level_db: 20

# Switches handed to the binarizer, grouped by purpose.
binarization_args:
  # dictionaries
  reset_phone_dict: true
  reset_word_dict: true
  # ordering and trimming
  shuffle: true
  trim_eos_bos: false
  trim_sil: false
  # with_* switches that are on
  with_align: true
  with_f0: true
  with_spk_embed: true
  with_spk_id: true
  with_txt: true
  with_wav: true
  with_word: true
  # with_* switches that are off
  with_f0cwt: false
  with_linear: false

# NOTE(review): this names the LibriTTS pre-align class while base_config
# inherits from egs/datasets/audio/emotion — confirm this is intended.
preprocess_cls: egs.datasets.audio.libritts.pre_align.LibrittsPreAlign
preprocess_args:
  # text process
  txt_processor: en
  add_eos_bos: true
  with_phsep: true
  reset_phone_dict: true
  reset_word_dict: true
  # mfa
  use_mfa: true
  nsample_per_mfa_group: 1000
  mfa_group_shuffle: false
  mfa_offset: 0.02
  # wav processors
  wav_processors: []
  save_sil_mask: true
  vad_max_silence_length: 12

# data
word_dict_size: 10000
num_spk: 500

# use_* switches
use_emotion: true
use_spk_embed: true
use_spk_id: false
use_word: true
use_gt_dur: false

# Empty in this config (presumably supplied at inference time — verify
# against caller).
ref_audio: ''
text: ''

# training
max_updates: 300000
max_sentences: 100000
num_test_samples: 72
# NOTE(review): -1 presumably means "no limit on sanity-check validation
# steps" — confirm against this project's trainer.
num_sanity_val_steps: -1

# glow
# post_glow_* sizes
post_glow_n_blocks: 8
post_glow_n_block_layers: 3
post_glow_hidden: 128
post_glow_kernel_size: 3
# sharing and scaling switches
share_wn_layers: 4
post_share_cond_layers: false
sigmoid_scale: false
# conditioning and noise
use_txt_cond: true
use_latent_cond: true
noise_scale: 0.8

# prosody extractor
nVQ: 128
lambda_commit: 0.25
vae_dropout: 0.0
# NOTE(review): these two look like training-step thresholds — confirm.
# As configured, forcing (20000) is lower than vq_start (20500).
forcing: 20000
vq_start: 20500
crop: false
predictor_grad: 1.0