Upload genau-full-s.yaml with huggingface_hub
Browse files- genau-full-s.yaml +25 -23
genau-full-s.yaml
CHANGED
@@ -5,14 +5,14 @@ training:
|
|
5 |
|
6 |
|
7 |
logging:
|
8 |
-
project_name: "
|
9 |
-
wandb_key:
|
10 |
log_directory: "./run_logs/genau/train"
|
11 |
|
12 |
-
#
|
13 |
-
#
|
14 |
-
|
15 |
-
|
16 |
save_checkpoint_every_n_steps: 1500
|
17 |
save_top_k: -1
|
18 |
|
@@ -33,8 +33,8 @@ variables:
|
|
33 |
|
34 |
data:
|
35 |
train: ['vggsounds', 'audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
|
36 |
-
val: "
|
37 |
-
test: "
|
38 |
class_label_indices: "audioset_eval_subset"
|
39 |
dataloader_add_ons: []
|
40 |
augment_p : 0.0
|
@@ -86,14 +86,14 @@ augmentation:
|
|
86 |
mixup: 0.0
|
87 |
|
88 |
model:
|
89 |
-
target:
|
90 |
params:
|
91 |
# dataset token
|
92 |
dataset_embed_dim: 32
|
93 |
|
94 |
# logging
|
95 |
-
|
96 |
-
|
97 |
num_val_sampled_timestamps: 10
|
98 |
|
99 |
# # evaluation
|
@@ -106,7 +106,7 @@ model:
|
|
106 |
# Optimizer
|
107 |
optimizer_config:
|
108 |
# Which optimizer to use
|
109 |
-
target: !module
|
110 |
# Which LR to use
|
111 |
lr: *lr
|
112 |
# The weight decay to use
|
@@ -129,10 +129,11 @@ model:
|
|
129 |
# Autoencoder
|
130 |
first_stage_config:
|
131 |
base_learning_rate: 8.0e-06
|
132 |
-
target:
|
|
|
133 |
params:
|
134 |
# reload_from_ckpt: "data/checkpoints/vae_mel_16k_64bins.ckpt"
|
135 |
-
reload_from_ckpt: "
|
136 |
sampling_rate: *sampling_rate
|
137 |
batchsize: *bs # TODO: change
|
138 |
monitor: val/rec_loss
|
@@ -140,8 +141,9 @@ model:
|
|
140 |
subband: 1
|
141 |
embed_dim: *latent_embed_dim
|
142 |
time_shuffle: 1
|
|
|
143 |
lossconfig:
|
144 |
-
target:
|
145 |
params:
|
146 |
disc_start: 50001
|
147 |
kl_weight: 1000.0
|
@@ -188,14 +190,14 @@ model:
|
|
188 |
|
189 |
backbone_type : fit
|
190 |
unet_config:
|
191 |
-
target:
|
192 |
|
193 |
params:
|
194 |
weight_initializer:
|
195 |
-
target: !module
|
196 |
scale: 0.57735026919 # 1/sqrt(3) from Yuwei's findings
|
197 |
|
198 |
-
fit_block_module: !module
|
199 |
context_channels: 1024
|
200 |
summary_text_embeddings_channels: 1536 # text embedding (e.g. CLAP) size
|
201 |
|
@@ -231,8 +233,8 @@ model:
|
|
231 |
|
232 |
self_conditioning_ff_config: {}
|
233 |
fit_block_config:
|
234 |
-
attention_class: !module
|
235 |
-
ff_class: !module
|
236 |
|
237 |
# Dropout parameters
|
238 |
drop_units: 0.1
|
@@ -300,16 +302,16 @@ model:
|
|
300 |
film_clap_cond1:
|
301 |
cond_stage_key: text
|
302 |
conditioning_key: film
|
303 |
-
target:
|
304 |
params:
|
305 |
-
pretrained_path:
|
306 |
sampling_rate: 16000
|
307 |
embed_mode: text # or text
|
308 |
amodel: HTSAT-tiny
|
309 |
film_flan_t5_cond2:
|
310 |
cond_stage_key: text
|
311 |
conditioning_key: film
|
312 |
-
target:
|
313 |
params:
|
314 |
text_encoder_name: google/flan-t5-large # google/flan-t5-xxl
|
315 |
freeze_text_encoder: True
|
|
|
5 |
|
6 |
|
7 |
logging:
|
8 |
+
project_name: "genau"
|
9 |
+
wandb_key: YOUR_WANDB_KEY (check wandb.ai/authorize)
|
10 |
log_directory: "./run_logs/genau/train"
|
11 |
|
12 |
+
# (optional) if an s3 path is specified, checkpoints will be saved at S3_FOLDER/log_directory and deleted from the local folder (except the last checkpoint). Otherwise, checkpoints will be saved locally indefinitely
|
13 |
+
# S3_BUCKET: "YOUR_S3_BUCKET"
|
14 |
+
# S3_FOLDER: 'YOUR_S3_FOLDER'
|
15 |
+
|
16 |
save_checkpoint_every_n_steps: 1500
|
17 |
save_top_k: -1
|
18 |
|
|
|
33 |
|
34 |
data:
|
35 |
train: ['vggsounds', 'audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
|
36 |
+
val: "audioset"
|
37 |
+
test: "audioset"
|
38 |
class_label_indices: "audioset_eval_subset"
|
39 |
dataloader_add_ons: []
|
40 |
augment_p : 0.0
|
|
|
86 |
mixup: 0.0
|
87 |
|
88 |
model:
|
89 |
+
target: src.models.genau_ddpm.GenAu
|
90 |
params:
|
91 |
# dataset token
|
92 |
dataset_embed_dim: 32
|
93 |
|
94 |
# logging
|
95 |
+
validate_uncond: False
|
96 |
+
validate_wo_ema: True
|
97 |
num_val_sampled_timestamps: 10
|
98 |
|
99 |
# # evaluation
|
|
|
106 |
# Optimizer
|
107 |
optimizer_config:
|
108 |
# Which optimizer to use
|
109 |
+
target: !module src.modules.optimizers.lamb.Lamb
|
110 |
# Which LR to use
|
111 |
lr: *lr
|
112 |
# The weight decay to use
|
|
|
129 |
# Autoencoder
|
130 |
first_stage_config:
|
131 |
base_learning_rate: 8.0e-06
|
132 |
+
target: src.modules.latent_encoder.autoencoder_1d.AutoencoderKL1D
|
133 |
+
|
134 |
params:
|
135 |
# reload_from_ckpt: "data/checkpoints/vae_mel_16k_64bins.ckpt"
|
136 |
+
reload_from_ckpt: "1dvae_64ch_16k_64bins"
|
137 |
sampling_rate: *sampling_rate
|
138 |
batchsize: *bs # TODO: change
|
139 |
monitor: val/rec_loss
|
|
|
141 |
subband: 1
|
142 |
embed_dim: *latent_embed_dim
|
143 |
time_shuffle: 1
|
144 |
+
|
145 |
lossconfig:
|
146 |
+
target: src.losses.LPIPSWithDiscriminator
|
147 |
params:
|
148 |
disc_start: 50001
|
149 |
kl_weight: 1000.0
|
|
|
190 |
|
191 |
backbone_type : fit
|
192 |
unet_config:
|
193 |
+
target: src.modules.fit.fit_audio.FIT
|
194 |
|
195 |
params:
|
196 |
weight_initializer:
|
197 |
+
target: !module src.modules.initializers.initializers.RINWeightScalerInitializer
|
198 |
scale: 0.57735026919 # 1/sqrt(3) from Yuwei's findings
|
199 |
|
200 |
+
fit_block_module: !module src.modules.fit.layers.fit_layers.FITBlockV5
|
201 |
context_channels: 1024
|
202 |
summary_text_embeddings_channels: 1536 # text embedding (e.g. CLAP) size
|
203 |
|
|
|
233 |
|
234 |
self_conditioning_ff_config: {}
|
235 |
fit_block_config:
|
236 |
+
attention_class: !module src.modules.fit.layers.rin_layers.Attention
|
237 |
+
ff_class: !module src.modules.fit.layers.rin_layers.FeedForward
|
238 |
|
239 |
# Dropout parameters
|
240 |
drop_units: 0.1
|
|
|
302 |
film_clap_cond1:
|
303 |
cond_stage_key: text
|
304 |
conditioning_key: film
|
305 |
+
target: src.modules.conditional.conditional_models.CLAPAudioEmbeddingClassifierFreev2
|
306 |
params:
|
307 |
+
pretrained_path: clap_htsat_tiny
|
308 |
sampling_rate: 16000
|
309 |
embed_mode: text # or text
|
310 |
amodel: HTSAT-tiny
|
311 |
film_flan_t5_cond2:
|
312 |
cond_stage_key: text
|
313 |
conditioning_key: film
|
314 |
+
target: src.modules.conditional.conditional_models.FlanT5HiddenState
|
315 |
params:
|
316 |
text_encoder_name: google/flan-t5-large # google/flan-t5-xxl
|
317 |
freeze_text_encoder: True
|