# Text-to-audio training config: CFM latent model with a DiT backbone,
# a 1D mel-spectrogram KL autoencoder first stage, and CLAP/FLAN text conditioning.
model:
  base_learning_rate: 3.0e-06
  target: ldm.models.diffusion.cfm1_audio.CFM
  params:
    linear_start: 0.00085
    linear_end: 0.012
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: caption
    mel_dim: 20
    mel_length: 256
    channels: 0
    cond_stage_trainable: true
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_by_std: true
    use_ema: false

    # LR multiplier ramps linearly from f_start to f_max over warm_up_steps, then stays at f_min
    scheduler_config:
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps:
        - 10000
        cycle_lengths:
        - 10000000000000
        f_start:
        - 1.0e-06
        f_max:
        - 1.0
        f_min:
        - 1.0

    # DiT-style transformer backbone operating on the first-stage latents
    unet_config:
      target: ldm.modules.diffusionmodules.flag_large_dit.TxtFlagLargeImprovedDiTV2
      params:
        in_channels: 20
        context_dim: 1024
        hidden_size: 768
        num_heads: 32
        depth: 16
        max_len: 1000

    # 1D KL autoencoder over 80-bin mel spectrograms, producing 20-dim latents
    first_stage_config:
      target: ldm.models.autoencoder1d.AutoencoderKL
      params:
        embed_dim: 20
        monitor: val/rec_loss
        ckpt_path: useful_ckpts/maa2/maa2.ckpt
        ddconfig:
          double_z: true
          in_channels: 80
          out_ch: 80
          z_channels: 20
          kernel_size: 5
          ch: 384
          ch_mult:
          - 1
          - 2
          - 4
          num_res_blocks: 2
          attn_layers:
          - 3
          down_layers:
          - 0
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    # frozen CLAP/FLAN text embedder used for caption conditioning
    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLAPFLANEmbedder
      params:
        weights_path: useful_ckpts/CLAP/CLAP_weights_2022.pth

lightning:
  callbacks:
    # logs mel spectrograms and decodes them to audio with the BigVGAN vocoder
    image_logger:
      target: main.AudioLogger
      params:
        sample_rate: 16000
        for_specs: true
        increase_log_steps: false
        batch_frequency: 5000
        max_images: 8
        melvmin: -5
        melvmax: 1.5
        vocoder_cfg:
          target: vocoder.bigvgan.models.VocoderBigVGAN
          params:
            ckpt_vocoder: useful_ckpts/bigvnat
  trainer:
    benchmark: true
    gradient_clip_val: 1.0
    limit_val_batches: 0.0  # validation loop disabled
  modelcheckpoint:
    params:
      monitor: epoch
      mode: max
      save_top_k: 10
      every_n_epochs: 5

data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 16
    num_workers: 20
    wrap: true
    train:
      target: ldm.data.txt_spec_dataset.spec_join_Dataset_Train
      params:
        dataset_cfg:
          dataset_name: audiocaps_train_16000_struct2.tsv
          spec_crop_len: 624
          drop: 0.2
    # note: validation reuses the training manifest
    validation:
      target: ldm.data.txt_spec_dataset.spec_join_Dataset_Valid
      params:
        dataset_cfg:
          dataset_name: audiocaps_train_16000_struct2.tsv
          spec_crop_len: 624
          drop: 0.0
    test_dataset:
      target: ldm.data.joinaudiodataset_struct_sample_anylen.TestManifest
      params:
        manifest: data/audiocaps_test_struct.tsv
        spec_crop_len: 624
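
# Assumed usage (verify against the repository's main.py; the flags below are the
# standard latent-diffusion-style CLI that main.DataModuleFromConfig implies, not
# something stated in this file):
#   python main.py --base <path/to/this_config>.yaml -t --gpus 0,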