{
    "model_type": "autoencoder",
    "sample_size": 65536,
    "sample_rate": 44100,
    "audio_channels": 1,
    "model": {
        "encoder": {
            "type": "oobleck",
            "config": {
                "in_channels": 1,
                "channels": 96,
                "c_mults": [1, 2, 4, 8, 16],
                "strides": [2, 4, 4, 8, 8],
                "latent_dim": 64,
                "use_snake": true
            }
        },
        "decoder": {
            "type": "oobleck",
            "config": {
                "out_channels": 1,
                "channels": 96,
                "c_mults": [1, 2, 4, 8, 16],
                "strides": [2, 4, 4, 8, 8],
                "latent_dim": 64,
                "use_snake": true,
                "final_tanh": false
            }
        },
        "bottleneck": {
            "type": "dac_rvq",
            "config": {
                "input_dim": 64,
                "n_codebooks": 9,
                "codebook_size": 1024,
                "codebook_dim": 8,
                "quantizer_dropout": 1.0
            }
        },
        "latent_dim": 64,
        "downsampling_ratio": 2048,
        "io_channels": 1
    },
    "training": {
        "learning_rate": 1.5e-4,
        "warmup_steps": 0,
        "use_ema": true,
        "optimizer_configs": {
            "autoencoder": {
                "optimizer": {
                    "type": "AdamW",
                    "config": {
                        "betas": [0.8, 0.99],
                        "lr": 1.5e-4,
                        "weight_decay": 1e-3
                    }
                },
                "scheduler": {
                    "type": "InverseLR",
                    "config": {
                        "inv_gamma": 200000,
                        "power": 0.5,
                        "warmup": 0.999
                    }
                }
            },
            "discriminator": {
                "optimizer": {
                    "type": "AdamW",
                    "config": {
                        "betas": [0.8, 0.99],
                        "lr": 3e-4,
                        "weight_decay": 1e-3
                    }
                },
                "scheduler": {
                    "type": "InverseLR",
                    "config": {
                        "inv_gamma": 200000,
                        "power": 0.5,
                        "warmup": 0.999
                    }
                }
            }
        },
        "loss_configs": {
            "discriminator": {
                "type": "encodec",
                "config": {
                    "filters": 64,
                    "n_ffts": [2048, 1024, 512, 256, 128],
                    "hop_lengths": [512, 256, 128, 64, 32],
                    "win_lengths": [2048, 1024, 512, 256, 128]
                },
                "weights": {
                    "adversarial": 0.1,
                    "feature_matching": 5.0
                }
            },
            "spectral": {
                "type": "mrstft",
                "config": {
                    "fft_sizes": [2048, 1024, 512, 256, 128, 64, 32],
                    "hop_sizes": [512, 256, 128, 64, 32, 16, 8],
                    "win_lengths": [2048, 1024, 512, 256, 128, 64, 32],
                    "perceptual_weighting": true
                },
                "weights": {
                    "mrstft": 1.0
                }
            },
            "time": {
                "type": "l1",
                "weights": {
                    "l1": 0.0
                }
            }
        },
        "demo": {
            "demo_every": 2000
        }
    }
}