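# MNP-SVC-VCTK-partial / config.yaml
# Hosting-page residue kept for provenance (not part of the configuration):
# avatar caption "TylorShine's picture", commit message "Upload 3 files",
# revision ea1f21b (verified).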
# Configuration with its block nesting restored. The settings are grouped under
# the sections `data`, `env`, `loss`, `model` and `train`; `device` is a
# top-level key. Keys are in alphabetical order at every level.

# Dataset location, sample rates, and the encoder / f0 extractor /
# speaker-embedding encoder selections with their checkpoint paths.
data:
  block_size: 512
  dataset_path: ../datasets/vctk-partial
  duration: 1.8
  encoder: dpwavlmbase
  encoder_ckpt: models/pretrained/dphubert/DPWavLM-sp0.75.pth
  encoder_hop_size: 320
  encoder_out_channels: 768
  encoder_sample_rate: 16000
  extensions:
    - wav
  f0_extractor: rmvpe
  f0_max: 1200
  f0_min: 65
  sampling_rate: 44100
  spk_embed_channels: 256
  spk_embed_encoder: pyannote.audio
  spk_embed_encoder_ckpt: ./models/pretrained/pyannote.audio/wespeaker-voxceleb-resnet34-LM/pytorch_model.bin
  spk_embed_encoder_sample_rate: 16000
  volume_window_size: 8

device: cuda

# Experiment output directory and GPU selection.
env:
  expdir: ../datasets/exp/vctk-partial
  gpu_id: 0

# Loss-function settings.
loss:
  beta: 0.8
  fft_max: 2048
  fft_min: 256
  n_scale: 4
  overlap: 0.5
  use_dual_scale: false
  use_dual_scale_log_freq: true

# Model type and architecture switches.
model:
  f0_input_variance: 0.0
  f0_offset_size_downsamples: 8
  harmonic_env_size_downsamples: 8
  no_use_embed_conv: false
  noise_env_size_downsamples: 8
  noise_seed: 289
  noise_to_harmonic_phase: true
  type: CombSubMinimumNoisedPhase
  units_hidden_channels: 256
  # NOTE(review): restored as a single nested pair [[10, 11]]; the flattened
  # source is also consistent with [[10], 11] -- confirm against the consumer.
  units_layers:
    - - 10
      - 11
  use_f0_offset: true
  use_harmonic_env: false
  use_noise_env: true
  use_speaker_embed: true
  win_length: 2048

# Training-loop, caching, optimizer and LR-scheduler settings.
train:
  amp_dtype: fp32
  batch_size: 48
  cache_all_data: true
  cache_device: cuda
  cache_fp16: true
  epochs: 50000
  frame_hop_random_max: 64
  frame_hop_random_min: 32
  interval_log: 10
  interval_val: 2000
  loss_variation: 0.1
  low_similar_loss_variation: 0.7
  lr: 0.0005
  num_workers: 2
  only_u2c_stack: false
  save_opt: false
  sched_cooldown: 2
  sched_factor: 0.5
  sched_min_lr: 3.0e-06
  sched_patience: 30
  sched_threshold: 1.0e-05
  sched_threshold_mode: rel
  weight_decay: 0