{
  "datasets": [
    {
      "formatter": "kokoro",
      "path": "DEFINE THIS",
      "meta_file_train": "metadata.csv",
      "meta_file_val": null
    }
  ],
  "audio": {
    "fft_size": 1024,
    "win_length": 1024,
    "hop_length": 256,
    "frame_length_ms": null,
    "frame_shift_ms": null,
    "sample_rate": 22050,
    "preemphasis": 0.0,
    "ref_level_db": 20,
    "do_trim_silence": true,
    "trim_db": 60,
    "power": 1.5,
    "griffin_lim_iters": 60,
    "num_mels": 80,
    "mel_fmin": 50.0,
    "mel_fmax": 7600.0,
    "spec_gain": 1,
    "signal_norm": true,
    "min_level_db": -100,
    "symmetric_norm": true,
    "max_norm": 4.0,
    "clip_norm": true,
    "stats_path": "scale_stats.npy"
  },
  "gst": {
    "gst_style_input": null,
    "gst_embedding_dim": 512,
    "gst_num_heads": 4,
    "gst_style_tokens": 10,
    "gst_use_speaker_embedding": false
  },
  "model": "Tacotron2",
  "run_name": "kokoro-ddc",
  "run_description": "tacotron2 with DDC and differential spectral loss.",
  "batch_size": 32,
  "eval_batch_size": 16,
  "mixed_precision": true,
  "distributed": {
    "backend": "nccl",
    "url": "tcp:\/\/localhost:54321"
  },
  "reinit_layers": [],
  "loss_masking": true,
  "decoder_loss_alpha": 0.5,
  "postnet_loss_alpha": 0.25,
  "postnet_diff_spec_alpha": 0.25,
  "decoder_diff_spec_alpha": 0.25,
  "decoder_ssim_alpha": 0.5,
  "postnet_ssim_alpha": 0.25,
  "ga_alpha": 5.0,
  "stopnet_pos_weight": 15.0,
  "run_eval": true,
  "test_delay_epochs": 10,
  "test_sentences_file": null,
  "noam_schedule": false,
  "grad_clip": 1.0,
  "epochs": 1000,
  "lr": 0.0001,
  "wd": 0.000001,
  "warmup_steps": 4000,
  "seq_len_norm": false,
  "memory_size": -1,
  "prenet_type": "original",
  "prenet_dropout": true,
  "attention_type": "original",
  "windowing": false,
  "use_forward_attn": false,
  "forward_attn_mask": false,
  "transition_agent": false,
  "location_attn": true,
  "bidirectional_decoder": false,
  "double_decoder_consistency": true,
  "ddc_r": 7,
  "attention_heads": 4,
  "attention_norm": "sigmoid",
  "r": 7,
  "gradual_training": [
    [0, 7, 64],
    [1, 5, 64],
    [50000, 3, 32],
    [130000, 2, 32],
    [290000, 1, 32]
  ],
  "stopnet": true,
  "separate_stopnet": true,
  "print_step": 25,
  "tb_plot_step": 100,
  "print_eval": false,
  "save_step": 10000,
  "checkpoint": true,
  "keep_all_best": false,
  "keep_after": 10000,
  "tb_model_param_stats": false,
  "text_cleaner": "basic_cleaners",
  "enable_eos_bos_chars": false,
  "num_loader_workers": 4,
  "num_val_loader_workers": 4,
  "batch_group_size": 4,
  "min_seq_len": 6,
  "max_seq_len": 153,
  "compute_input_seq_cache": false,
  "use_noise_augment": true,
  "output_path": "DEFINE THIS",
  "phoneme_cache_path": "DEFINE THIS",
  "use_phonemes": true,
  "phoneme_language": "ja-jp",
  "characters": {
    "pad": "_",
    "eos": "~",
    "bos": "^",
    "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
    "punctuations": "!'(),-.:;? ",
    "phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
  },
  "use_speaker_embedding": false,
  "use_gst": false,
  "use_external_speaker_embedding_file": false,
  "external_speaker_embedding_file": "../../speakers-vctk-en.json"
}