---
# Experiment configuration, restored to block-style YAML (one key per line,
# 2-space indentation). Top-level sections: data, device, env, loss, model,
# train. Keys within each section are kept in their original (alphabetical)
# order and every scalar value is unchanged.

# Dataset location plus encoder / f0 / speaker-embedding settings.
data:
  block_size: 512
  dataset_path: ../datasets/vctk-partial
  duration: 1.8
  encoder: dpwavlmbase
  encoder_ckpt: models/pretrained/dphubert/DPWavLM-sp0.75.pth
  encoder_hop_size: 320
  encoder_out_channels: 768
  encoder_sample_rate: 16000
  extensions:
    - wav
  f0_extractor: rmvpe
  f0_max: 1200
  f0_min: 65
  sampling_rate: 44100
  spk_embed_channels: 256
  spk_embed_encoder: pyannote.audio
  spk_embed_encoder_ckpt: ./models/pretrained/pyannote.audio/wespeaker-voxceleb-resnet34-LM/pytorch_model.bin
  spk_embed_encoder_sample_rate: 16000
  volume_window_size: 8

# Top-level compute device selector.
device: cuda

# Experiment output directory and GPU index.
env:
  expdir: ../datasets/exp/vctk-partial
  gpu_id: 0

# Loss settings (FFT size range, number of scales, overlap).
loss:
  beta: 0.8
  fft_max: 2048
  fft_min: 256
  n_scale: 4
  overlap: 0.5
  use_dual_scale: false
  use_dual_scale_log_freq: true

# Model type and its structural switches.
model:
  f0_input_variance: 0.0
  f0_offset_size_downsamples: 8
  harmonic_env_size_downsamples: 8
  no_use_embed_conv: false
  noise_env_size_downsamples: 8
  noise_seed: 289
  noise_to_harmonic_phase: true
  type: CombSubMinimumNoisedPhase
  units_hidden_channels: 256
  # NOTE(review): reconstructed as a single nested list [[10, 11]]; the
  # flattened source could also be read as [[10], 11] -- confirm against the
  # consumer of this key.
  units_layers:
    - - 10
      - 11
  use_f0_offset: true
  use_harmonic_env: false
  use_noise_env: true
  use_speaker_embed: true
  win_length: 2048

# Training-loop, caching, optimizer and LR-scheduler settings.
train:
  amp_dtype: fp32
  batch_size: 48
  cache_all_data: true
  cache_device: cuda
  cache_fp16: true
  epochs: 50000
  frame_hop_random_max: 64
  frame_hop_random_min: 32
  interval_log: 10
  interval_val: 2000
  loss_variation: 0.1
  low_similar_loss_variation: 0.7
  lr: 0.0005
  num_workers: 2
  only_u2c_stack: false
  save_opt: false
  sched_cooldown: 2
  sched_factor: 0.5
  sched_min_lr: 3.0e-06
  sched_patience: 30
  sched_threshold: 1.0e-05
  sched_threshold_mode: rel
  weight_decay: 0