mali6 committed
Commit 537aa7f · verified · 1 Parent(s): 3555c0a

Upload genau-full-s.yaml with huggingface_hub

Files changed (1):
  1. genau-full-s.yaml +25 -23
genau-full-s.yaml CHANGED
@@ -5,14 +5,14 @@ training:
 
 
   logging:
-    project_name: "audioldm-snap"
-    wandb_key: 48955513a8a3387ed6a17f75021431035150e1fe
+    project_name: "genau"
+    wandb_key: YOUR_WANDB_KEY (check wandb.ai/authorize)
     log_directory: "./run_logs/genau/train"
 
-    # Saving Checkpoints
-    # if s3 path is speicified, checkpoints be saved at S3_FOLDED/log_directory and deleted from the local folder (except the last checkpoint). Otherwise, checkpointwill be save locally indefinitely
-    S3_BUCKET: "snap-genvid"
-    S3_FOLDER: 'mali6/audioldm'
+    # (optional) if s3 path is speicified, checkpoints be saved at S3_FOLDED/log_directory and deleted from the local folder (except the last checkpoint). Otherwise, checkpointwill be save locally indefinitely
+    # S3_BUCKET: "YOUR_S3_BUCKET"
+    # S3_FOLDER: 'YOUR_S3_FOLDER'
+
     save_checkpoint_every_n_steps: 1500
     save_top_k: -1
 
@@ -33,8 +33,8 @@ variables:
 
 data:
   train: ['vggsounds', 'audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
-  val: "autocap"
-  test: "autocap"
+  val: "audioset"
+  test: "audioset"
   class_label_indices: "audioset_eval_subset"
   dataloader_add_ons: []
   augment_p : 0.0
@@ -86,14 +86,14 @@ augmentation:
   mixup: 0.0
 
 model:
-  target: audioldm_train.modules.latent_diffusion.ddpm.LatentDiffusion
+  target: src.models.genau_ddpm.GenAu
   params:
     # dataset token
     dataset_embed_dim: 32
 
     # logging
-    log_uncond: False
-    validation_wo_ema: True
+    validate_uncond: False
+    validate_wo_ema: True
     num_val_sampled_timestamps: 10
 
     # # evaluation
@@ -106,7 +106,7 @@ model:
     # Optimizer
     optimizer_config:
       # Which optimizer to use
-      target: !module audioldm_train.modules.snapvideo.training.optimizers.lamb.Lamb
+      target: !module src.modules.optimizers.lamb.Lamb
       # Which LR to use
       lr: *lr
       # The weight decay to use
@@ -129,10 +129,11 @@ model:
     # Autoencoder
     first_stage_config:
       base_learning_rate: 8.0e-06
-      target: audioldm_train.modules.latent_encoder.autoencoder_1d.AutoencoderKL1D
+      target: src.modules.latent_encoder.autoencoder_1d.AutoencoderKL1D
+
       params:
         # reload_from_ckpt: "data/checkpoints/vae_mel_16k_64bins.ckpt"
-        reload_from_ckpt: "/fsx/mali6/repos/AudioLDM2-training/log/vae_checkpoints/vae_64hdcheckpoint-344999.ckpt"
+        reload_from_ckpt: "1dvae_64ch_16k_64bins"
        sampling_rate: *sampling_rate
        batchsize: *bs # TODO: chagne
        monitor: val/rec_loss
@@ -140,8 +141,9 @@ model:
        subband: 1
        embed_dim: *latent_embed_dim
        time_shuffle: 1
+
        lossconfig:
-          target: audioldm_train.losses.LPIPSWithDiscriminator
+          target: src.losses.LPIPSWithDiscriminator
          params:
            disc_start: 50001
            kl_weight: 1000.0
@@ -188,14 +190,14 @@ model:
 
     backbone_type : fit
     unet_config:
-      target: audioldm_train.modules.snapvideo.models.vision.backbones.fit_audio.FIT
+      target: src.modules.fit.fit_audio.FIT
 
       params:
        weight_initializer:
-          target: !module audioldm_train.modules.snapvideo.models.initializers.rin_weight_scaler_initializer.RINWeightScalerInitializer
+          target: !module src.modules.initializers.initializers.RINWeightScalerInitializer
          scale: 0.57735026919 # 1/sqrt(3) from Yuwei's findings
 
-        fit_block_module: !module audioldm_train.modules.snapvideo.models.vision.layers.fit_block_v5.FITBlockV5
+        fit_block_module: !module src.modules.fit.layers.fit_layers.FITBlockV5
        context_channels: 1024
        summary_text_embeddings_channels: 1536 # text embedding (e.g CLAP) size
 
@@ -231,8 +233,8 @@ model:
 
        self_conditioning_ff_config: {}
        fit_block_config:
-          attention_class: !module audioldm_train.modules.snapvideo.models.vision.layers.rin_layers.Attention
-          ff_class: !module audioldm_train.modules.snapvideo.models.vision.layers.rin_layers.FeedForward
+          attention_class: !module src.modules.fit.layers.rin_layers.Attention
+          ff_class: !module src.modules.fit.layers.rin_layers.FeedForward
 
          # Dropout parameters
          drop_units: 0.1
@@ -300,16 +302,16 @@ model:
     film_clap_cond1:
       cond_stage_key: text
       conditioning_key: film
-      target: audioldm_train.conditional_models.CLAPAudioEmbeddingClassifierFreev2
+      target: src.modules.conditional.conditional_models.CLAPAudioEmbeddingClassifierFreev2
       params:
-        pretrained_path: data/checkpoints/clap_htsat_tiny.pt
+        pretrained_path: clap_htsat_tiny
        sampling_rate: 16000
        embed_mode: text # or text
        amodel: HTSAT-tiny
     film_flan_t5_cond2:
       cond_stage_key: text
       conditioning_key: film
-      target: audioldm_train.conditional_models.FlanT5HiddenState
+      target: src.modules.conditional.conditional_models.FlanT5HiddenState
       params:
        text_encoder_name: google/flan-t5-large # google/flan-t5-xxl
        freeze_text_encoder: True
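
A note on how a config like this is typically consumed: each `target:` entry names a class by dotted path and `params:` holds its constructor arguments, while `!module` is a custom YAML tag that resolves a dotted path to the class object at load time. The sketch below shows one common way to implement that pattern; the helper names (`get_class_from_path`, `instantiate_from_config`, `ConfigLoader`) are illustrative assumptions, not the repository's actual API.

```python
import importlib

import yaml


def get_class_from_path(path: str):
    """Resolve a dotted path such as 'src.modules.optimizers.lamb.Lamb' to a class."""
    module_name, class_name = path.rsplit(".", 1)
    return getattr(importlib.import_module(module_name), class_name)


def instantiate_from_config(node: dict):
    """Hypothetical helper: build the object named by `target` with its `params`."""
    target = node["target"]
    cls = target if not isinstance(target, str) else get_class_from_path(target)
    return cls(**node.get("params", {}))


# `!module` is a custom tag, so plain yaml.safe_load would reject it.
# Registering a constructor makes the tag resolve directly to the class.
class ConfigLoader(yaml.SafeLoader):
    pass


ConfigLoader.add_constructor(
    "!module",
    lambda loader, node: get_class_from_path(loader.construct_scalar(node)),
)

with open("genau-full-s.yaml") as f:
    cfg = yaml.load(f, Loader=ConfigLoader)
```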
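The commented-out `S3_BUCKET`/`S3_FOLDER` options describe a policy where checkpoints are mirrored to `S3_FOLDER/log_directory` and all local copies except the most recent one are deleted. A rough, purely illustrative sketch of that behavior with boto3; the function name and file layout are assumptions, not the training code's implementation.

```python
import os

import boto3


def sync_checkpoints_to_s3(checkpoint_dir: str, bucket: str, folder: str, log_directory: str) -> None:
    """Upload local checkpoints to S3, then keep only the newest one locally."""
    s3 = boto3.client("s3")
    ckpts = sorted(
        (os.path.join(checkpoint_dir, f) for f in os.listdir(checkpoint_dir) if f.endswith(".ckpt")),
        key=os.path.getmtime,
    )
    for path in ckpts:
        key = f"{folder}/{log_directory.lstrip('./')}/{os.path.basename(path)}"
        s3.upload_file(path, bucket, key)
    for path in ckpts[:-1]:  # prune everything except the most recent checkpoint
        os.remove(path)
```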
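Finally, the `film_clap_cond1` and `film_flan_t5_cond2` blocks use `conditioning_key: film`, i.e. FiLM-style conditioning: a text embedding (CLAP or FLAN-T5) is projected to per-channel scale and shift values that modulate backbone features. A minimal sketch of that mechanism with illustrative dimensions taken from this config; it is not the repository's exact module.

```python
import torch
import torch.nn as nn


class FiLM(nn.Module):
    """Minimal FiLM layer: modulate features with a conditioning embedding."""

    def __init__(self, cond_dim: int, num_channels: int):
        super().__init__()
        # Project the conditioning vector to a per-channel scale and shift.
        self.proj = nn.Linear(cond_dim, 2 * num_channels)

    def forward(self, features: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        # features: (batch, channels, time); cond: (batch, cond_dim)
        scale, shift = self.proj(cond).chunk(2, dim=-1)
        return features * (1 + scale.unsqueeze(-1)) + shift.unsqueeze(-1)


# Example using the config's 1536-dim text embeddings and 1024 context channels.
film = FiLM(cond_dim=1536, num_channels=1024)
out = film(torch.randn(2, 1024, 250), torch.randn(2, 1536))
```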