{
  "base_config": "egs/tta/audioldm/exp_config_base.json",
  "dataset": [
    "AudioCaps"
  ],
  "preprocess": {
    "processed_dir": "data",
    "use_spkid": false,
    "use_uv": false,
    "use_frame_pitch": false,
    "use_phone_pitch": false,
    "use_frame_energy": false,
    "use_phone_energy": false,
    "use_mel": false,
    "use_audio": false,
    "use_label": false,
    "use_one_hot": false,
    "use_caption": true,
    "use_melspec": true,
    "use_wav": false,
    "melspec_dir": "mel",
    "wav_dir": "wav"
  },
  "log_dir": "ckpts/tta",
  "model": {
    "audioldm": {
      "image_size": 32,
      "in_channels": 4,
      "out_channels": 4,
      "model_channels": 256,
      "attention_resolutions": [4, 2, 1],
      "num_res_blocks": 2,
      "channel_mult": [1, 2, 4],
      "num_heads": 8,
      "use_spatial_transformer": true,
      "transformer_depth": 1,
      "context_dim": 768,
      "use_checkpoint": true,
      "legacy": false
    },
    "autoencoderkl": {
      "ch": 128,
      "ch_mult": [1, 2, 2, 4],
      "num_res_blocks": 2,
      "in_channels": 1,
      "z_channels": 4,
      "out_ch": 1,
      "double_z": true
    },
    "noise_scheduler": {
      "num_train_timesteps": 1000,
      "beta_start": 0.00085,
      "beta_end": 0.012,
      "beta_schedule": "scaled_linear",
      "clip_sample": false,
      "steps_offset": 1,
      "set_alpha_to_one": false,
      "skip_prk_steps": true,
      "prediction_type": "epsilon"
    },
    "autoencoder_path": "ckpts/tta/autoencoder_kl_debug_latent_size_4_10_78/checkpoints/step-0390000_loss-0.2876.pt"
  },
  "train": {
    "adam": {
      "lr": 2.0e-5
    },
    "ddp": false,
    "random_seed": 12345,
    "batch_size": 12,
    "epochs": 50000,
    "max_steps": 1000000,
    "total_training_steps": 800000,
    "save_summary_steps": 1000,
    "save_checkpoints_steps": 5000,
    "valid_interval": 5000,
    "keep_checkpoint_max": 100
  }
}