{
  "model_type": "autoencoder",
  "sample_size": 65536,
  "sample_rate": 44100,
  "audio_channels": 1,
  "model": {
    "encoder": {
      "type": "oobleck",
      "config": {
        "in_channels": 1,
        "channels": 96,
        "c_mults": [1, 2, 4, 8, 16],
        "strides": [2, 4, 4, 8, 8],
        "latent_dim": 64,
        "use_snake": true
      }
    },
    "decoder": {
      "type": "oobleck",
      "config": {
        "out_channels": 1,
        "channels": 96,
        "c_mults": [1, 2, 4, 8, 16],
        "strides": [2, 4, 4, 8, 8],
        "latent_dim": 64,
        "use_snake": true,
        "final_tanh": false
      }
    },
    "bottleneck": {
      "type": "dac_rvq",
      "config": {
        "input_dim": 64,
        "n_codebooks": 9,
        "codebook_size": 1024,
        "codebook_dim": 8,
        "quantizer_dropout": 1.0
      }
    },
    "latent_dim": 64,
    "downsampling_ratio": 2048,
    "io_channels": 1
  },
  "training": {
    "learning_rate": 1.5e-4,
    "warmup_steps": 0,
    "use_ema": true,
    "optimizer_configs": {
      "autoencoder": {
        "optimizer": {
          "type": "AdamW",
          "config": {
            "betas": [0.8, 0.99],
            "lr": 1.5e-4,
            "weight_decay": 1e-3
          }
        },
        "scheduler": {
          "type": "InverseLR",
          "config": {
            "inv_gamma": 200000,
            "power": 0.5,
            "warmup": 0.999
          }
        }
      },
      "discriminator": {
        "optimizer": {
          "type": "AdamW",
          "config": {
            "betas": [0.8, 0.99],
            "lr": 3e-4,
            "weight_decay": 1e-3
          }
        },
        "scheduler": {
          "type": "InverseLR",
          "config": {
            "inv_gamma": 200000,
            "power": 0.5,
            "warmup": 0.999
          }
        }
      }
    },
    "loss_configs": {
      "discriminator": {
        "type": "encodec",
        "config": {
          "filters": 64,
          "n_ffts": [2048, 1024, 512, 256, 128],
          "hop_lengths": [512, 256, 128, 64, 32],
          "win_lengths": [2048, 1024, 512, 256, 128]
        },
        "weights": {
          "adversarial": 0.1,
          "feature_matching": 5.0
        }
      },
      "spectral": {
        "type": "mrstft",
        "config": {
          "fft_sizes": [2048, 1024, 512, 256, 128, 64, 32],
          "hop_sizes": [512, 256, 128, 64, 32, 16, 8],
          "win_lengths": [2048, 1024, 512, 256, 128, 64, 32],
          "perceptual_weighting": true
        },
        "weights": {
          "mrstft": 1.0
        }
      },
      "time": {
        "type": "l1",
        "weights": {
          "l1": 0.0
        }
      }
    },
    "demo": {
      "demo_every": 2000
    }
  }
}