jadechoghari
committed on
Commit
•
2a97bc5
1
Parent(s):
a705269
Update audioldm_train/config/mos_as_token/qa_mdt.yaml
Browse files
audioldm_train/config/mos_as_token/qa_mdt.yaml
CHANGED
@@ -3,7 +3,7 @@ project: "audioldm"
|
|
3 |
precision: "high"
|
4 |
|
5 |
# TODO: change this with your project path
|
6 |
-
base_root: "
|
7 |
|
8 |
# TODO: change this with your pretrained path
|
9 |
# TODO: pretrained path is also needed in "base_root/offset_pretrained_checkpoints.json"
|
@@ -70,12 +70,12 @@ augmentation:
|
|
70 |
mixup: 0.0
|
71 |
|
72 |
model:
|
73 |
-
target: audioldm_train.modules.latent_diffusion.ddpm.LatentDiffusion
|
74 |
params:
|
75 |
# Autoencoder
|
76 |
first_stage_config:
|
77 |
base_learning_rate: 8.0e-06
|
78 |
-
target: audioldm_train.modules.latent_encoder.autoencoder.AutoencoderKL
|
79 |
params:
|
80 |
# TODO: change it with your VAE checkpoint
|
81 |
reload_from_ckpt: "./qa-mdt/checkpoints/hifi-gan/checkpoints/vae_mel_16k_64bins.ckpt"
|
@@ -87,7 +87,7 @@ model:
|
|
87 |
embed_dim: *latent_embed_dim
|
88 |
time_shuffle: 1
|
89 |
lossconfig:
|
90 |
-
target: audioldm_train.losses.LPIPSWithDiscriminator
|
91 |
params:
|
92 |
disc_start: 50001
|
93 |
kl_weight: 1000.0
|
@@ -133,7 +133,7 @@ model:
|
|
133 |
unet_config:
|
134 |
# TODO: choose your class, Default: MDT_MOS_AS_TOKEN
|
135 |
# (Noted: the 2D-Rope, SwiGLU and the MDT are in two classes, when training with all of them, they should be changed and merged)
|
136 |
-
target: audioldm_train.modules.diffusionmodules.PixArt.PixArt_MDT_MOS_AS_TOKEN
|
137 |
params:
|
138 |
input_size : [256, 16]
|
139 |
# patch_size: [16,4]
|
@@ -161,7 +161,7 @@ model:
|
|
161 |
crossattn_flan_t5:
|
162 |
cond_stage_key: text
|
163 |
conditioning_key: crossattn
|
164 |
-
target: audioldm_train.conditional_models.FlanT5HiddenState
|
165 |
|
166 |
evaluation_params:
|
167 |
unconditional_guidance_scale: 3.5
|
|
|
3 |
precision: "high"
|
4 |
|
5 |
# TODO: change this with your project path
|
6 |
+
base_root: "./qa-mdt"
|
7 |
|
8 |
# TODO: change this with your pretrained path
|
9 |
# TODO: pretrained path is also needed in "base_root/offset_pretrained_checkpoints.json"
|
|
|
70 |
mixup: 0.0
|
71 |
|
72 |
model:
|
73 |
+
target: qa_mdt.audioldm_train.modules.latent_diffusion.ddpm.LatentDiffusion
|
74 |
params:
|
75 |
# Autoencoder
|
76 |
first_stage_config:
|
77 |
base_learning_rate: 8.0e-06
|
78 |
+
target: qa_mdt.audioldm_train.modules.latent_encoder.autoencoder.AutoencoderKL
|
79 |
params:
|
80 |
# TODO: change it with your VAE checkpoint
|
81 |
reload_from_ckpt: "./qa-mdt/checkpoints/hifi-gan/checkpoints/vae_mel_16k_64bins.ckpt"
|
|
|
87 |
embed_dim: *latent_embed_dim
|
88 |
time_shuffle: 1
|
89 |
lossconfig:
|
90 |
+
target: qa_mdt.audioldm_train.losses.LPIPSWithDiscriminator
|
91 |
params:
|
92 |
disc_start: 50001
|
93 |
kl_weight: 1000.0
|
|
|
133 |
unet_config:
|
134 |
# TODO: choose your class, Default: MDT_MOS_AS_TOKEN
|
135 |
# (Noted: the 2D-Rope, SwiGLU and the MDT are in two classes, when training with all of them, they should be changed and merged)
|
136 |
+
target: qa_mdt.audioldm_train.modules.diffusionmodules.PixArt.PixArt_MDT_MOS_AS_TOKEN
|
137 |
params:
|
138 |
input_size : [256, 16]
|
139 |
# patch_size: [16,4]
|
|
|
161 |
crossattn_flan_t5:
|
162 |
cond_stage_key: text
|
163 |
conditioning_key: crossattn
|
164 |
+
target: qa_mdt.audioldm_train.conditional_models.FlanT5HiddenState
|
165 |
|
166 |
evaluation_params:
|
167 |
unconditional_guidance_scale: 3.5
|