jadechoghari
committed on
Commit
•
2a97bc5
1
Parent(s):
a705269
Update audioldm_train/config/mos_as_token/qa_mdt.yaml
Browse files
audioldm_train/config/mos_as_token/qa_mdt.yaml
CHANGED
@@ -3,7 +3,7 @@ project: "audioldm"
|
|
3 |
precision: "high"
|
4 |
|
5 |
# TODO: change this with your project path
|
6 |
-
base_root: "
|
7 |
|
8 |
# TODO: change this with your pretrained path
|
9 |
# TODO: pretrained path is also needed in "base_root/offset_pretrained_checkpoints.json"
|
@@ -70,12 +70,12 @@ augmentation:
|
|
70 |
mixup: 0.0
|
71 |
|
72 |
model:
|
73 |
-
target: audioldm_train.modules.latent_diffusion.ddpm.LatentDiffusion
|
74 |
params:
|
75 |
# Autoencoder
|
76 |
first_stage_config:
|
77 |
base_learning_rate: 8.0e-06
|
78 |
-
target: audioldm_train.modules.latent_encoder.autoencoder.AutoencoderKL
|
79 |
params:
|
80 |
# TODO: change it with your VAE checkpoint
|
81 |
reload_from_ckpt: "./qa-mdt/checkpoints/hifi-gan/checkpoints/vae_mel_16k_64bins.ckpt"
|
@@ -87,7 +87,7 @@ model:
|
|
87 |
embed_dim: *latent_embed_dim
|
88 |
time_shuffle: 1
|
89 |
lossconfig:
|
90 |
-
target: audioldm_train.losses.LPIPSWithDiscriminator
|
91 |
params:
|
92 |
disc_start: 50001
|
93 |
kl_weight: 1000.0
|
@@ -133,7 +133,7 @@ model:
|
|
133 |
unet_config:
|
134 |
# TODO: choose your class, Default: MDT_MOS_AS_TOKEN
|
135 |
# (Noted: the 2D-Rope, SwiGLU and the MDT are in two classes, when training with all of them, they should be changed and merged)
|
136 |
-
target: audioldm_train.modules.diffusionmodules.PixArt.PixArt_MDT_MOS_AS_TOKEN
|
137 |
params:
|
138 |
input_size : [256, 16]
|
139 |
# patch_size: [16,4]
|
@@ -161,7 +161,7 @@ model:
|
|
161 |
crossattn_flan_t5:
|
162 |
cond_stage_key: text
|
163 |
conditioning_key: crossattn
|
164 |
-
target: audioldm_train.conditional_models.FlanT5HiddenState
|
165 |
|
166 |
evaluation_params:
|
167 |
unconditional_guidance_scale: 3.5
|
|
|
3 |
precision: "high"
|
4 |
|
5 |
# TODO: change this with your project path
|
6 |
+
base_root: "./qa-mdt"
|
7 |
|
8 |
# TODO: change this with your pretrained path
|
9 |
# TODO: pretrained path is also needed in "base_root/offset_pretrained_checkpoints.json"
|
|
|
70 |
mixup: 0.0
|
71 |
|
72 |
model:
|
73 |
+
target: qa_mdt.audioldm_train.modules.latent_diffusion.ddpm.LatentDiffusion
|
74 |
params:
|
75 |
# Autoencoder
|
76 |
first_stage_config:
|
77 |
base_learning_rate: 8.0e-06
|
78 |
+
target: qa_mdt.audioldm_train.modules.latent_encoder.autoencoder.AutoencoderKL
|
79 |
params:
|
80 |
# TODO: change it with your VAE checkpoint
|
81 |
reload_from_ckpt: "./qa-mdt/checkpoints/hifi-gan/checkpoints/vae_mel_16k_64bins.ckpt"
|
|
|
87 |
embed_dim: *latent_embed_dim
|
88 |
time_shuffle: 1
|
89 |
lossconfig:
|
90 |
+
target: qa_mdt.audioldm_train.losses.LPIPSWithDiscriminator
|
91 |
params:
|
92 |
disc_start: 50001
|
93 |
kl_weight: 1000.0
|
|
|
133 |
unet_config:
|
134 |
# TODO: choose your class, Default: MDT_MOS_AS_TOKEN
|
135 |
# (Noted: the 2D-Rope, SwiGLU and the MDT are in two classes, when training with all of them, they should be changed and merged)
|
136 |
+
target: qa_mdt.audioldm_train.modules.diffusionmodules.PixArt.PixArt_MDT_MOS_AS_TOKEN
|
137 |
params:
|
138 |
input_size : [256, 16]
|
139 |
# patch_size: [16,4]
|
|
|
161 |
crossattn_flan_t5:
|
162 |
cond_stage_key: text
|
163 |
conditioning_key: crossattn
|
164 |
+
target: qa_mdt.audioldm_train.conditional_models.FlanT5HiddenState
|
165 |
|
166 |
evaluation_params:
|
167 |
unconditional_guidance_scale: 3.5
|