|
data: |
|
block_size: 512 |
|
dataset_path: ../datasets/vctk-partial |
|
duration: 1.8 |
|
encoder: dpwavlmbase |
|
encoder_ckpt: models/pretrained/dphubert/DPWavLM-sp0.75.pth |
|
encoder_hop_size: 320 |
|
encoder_out_channels: 768 |
|
encoder_sample_rate: 16000 |
|
extensions: |
|
- wav |
|
f0_extractor: rmvpe |
|
f0_max: 1200 |
|
f0_min: 65 |
|
sampling_rate: 44100 |
|
spk_embed_channels: 256 |
|
spk_embed_encoder: pyannote.audio |
|
spk_embed_encoder_ckpt: ./models/pretrained/pyannote.audio/wespeaker-voxceleb-resnet34-LM/pytorch_model.bin |
|
spk_embed_encoder_sample_rate: 16000 |
|
volume_window_size: 8 |
|
device: cuda |
|
env: |
|
expdir: ../datasets/exp/vctk-partial |
|
gpu_id: 0 |
|
loss: |
|
beta: 0.8 |
|
fft_max: 2048 |
|
fft_min: 256 |
|
n_scale: 4 |
|
overlap: 0.5 |
|
use_dual_scale: false |
|
use_dual_scale_log_freq: true |
|
model: |
|
f0_input_variance: 0.0 |
|
f0_offset_size_downsamples: 8 |
|
harmonic_env_size_downsamples: 8 |
|
no_use_embed_conv: false |
|
noise_env_size_downsamples: 8 |
|
noise_seed: 289 |
|
noise_to_harmonic_phase: true |
|
type: CombSubMinimumNoisedPhase |
|
units_hidden_channels: 256 |
|
units_layers: |
|
- - 10 |
|
- 11 |
|
use_f0_offset: true |
|
use_harmonic_env: false |
|
use_noise_env: true |
|
use_speaker_embed: true |
|
win_length: 2048 |
|
train: |
|
amp_dtype: fp32 |
|
batch_size: 48 |
|
cache_all_data: true |
|
cache_device: cuda |
|
cache_fp16: true |
|
epochs: 50000 |
|
frame_hop_random_max: 64 |
|
frame_hop_random_min: 32 |
|
interval_log: 10 |
|
interval_val: 2000 |
|
loss_variation: 0.1 |
|
low_similar_loss_variation: 0.7 |
|
lr: 0.0005 |
|
num_workers: 2 |
|
only_u2c_stack: false |
|
save_opt: false |
|
sched_cooldown: 2 |
|
sched_factor: 0.5 |
|
sched_min_lr: 3.0e-06 |
|
sched_patience: 30 |
|
sched_threshold: 1.0e-05 |
|
sched_threshold_mode: rel |
|
weight_decay: 0 |
|
|