mali6 commited on
Commit
3555c0a
·
verified ·
1 Parent(s): 720b7a8

Upload genau-full-l.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. genau-full-l.yaml +12 -27
genau-full-l.yaml CHANGED
@@ -4,14 +4,14 @@ training:
4
  nodes_count: -1
5
 
6
  logging:
7
- project_name: "audioldm-snap"
8
- wandb_key: 48955513a8a3387ed6a17f75021431035150e1fe
9
- log_directory: "./log/latent_diffusion"
 
 
 
 
10
 
11
- # Saving Checkpoints
12
- # if s3 path is specified, checkpoints will be saved at S3_FOLDER/log_directory and deleted from the local folder (except the last checkpoint). Otherwise, checkpoints will be saved locally indefinitely
13
- S3_BUCKET: "snap-genvid"
14
- S3_FOLDER: 'mali6/audioldm'
15
  save_checkpoint_every_n_steps: 1500
16
  save_top_k: -1
17
 
@@ -31,10 +31,10 @@ variables:
31
  batch_size: &bs 20 # TODO: change to 256
32
 
33
  data:
34
- metadata_root: "/fsx/mali6/datasets/metadata/dataset_root.json"
35
  train: ['vggsounds', 'audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
36
- val: "audiocaps"
37
- test: "audiocaps"
38
  class_label_indices: "audioset_eval_subset"
39
  dataloader_add_ons: []
40
  augment_p : 0.0
@@ -49,6 +49,7 @@ data:
49
  - caption
50
  - best_model_w_meta_pred_caption
51
  - gt_audio_caption
 
52
  - wavcaps_caption
53
  tags:
54
  - keywords
@@ -89,17 +90,6 @@ model:
89
  params:
90
  # dataset token
91
  dataset_embed_dim: 32
92
- dataset2id:
93
- audiocaps: 0
94
- clotho: 1
95
- vggsounds: 2
96
- wavcaps_audioset_strong: 3
97
- wavcaps_bbcsound: 4
98
- wavcaps_freesound: 5
99
- wavcaps_soundbible: 6
100
- fsd50k: 7
101
- caption_audioset: 8
102
-
103
 
104
  # logging
105
  validate_uncond: False
@@ -214,16 +204,10 @@ model:
214
 
215
  # The type of positional encodings to use for the time input
216
  time_pe_type: learned
217
- # Uses a label that specifies whether the current input is a video or an image
218
- use_video_image_conditioning: False
219
- # Uses a label that specifies the framerate of the current video
220
- use_framerate_conditioning: False
221
  # Uses a label that specifies the id of the dataset from which the current input comes
222
  use_dataset_id_conditioning: True
223
  # Uses a label that specifies the resolution of the current input
224
  use_resolution_conditioning: False
225
- # If True uses the unmasked parts of the denoised input as conditioning
226
- use_denoised_input_conditioning: False
227
 
228
  # Size of the input in pixels
229
  input_size: [1, *latent_t_size, *latent_f_size] # (frames_count, height, width)
@@ -348,6 +332,7 @@ model:
348
  wavcaps_soundbible: 6
349
  fsd50k: 7
350
  caption_audioset: 8
 
351
  unconditional: 0 # set the unconditional to 0 for future experiments
352
 
353
 
 
4
  nodes_count: -1
5
 
6
  logging:
7
+ project_name: "genau"
8
+ wandb_key: YOUR_WANDB_KEY (check wandb.ai/authorize)
9
+ log_directory: "./run_logs/genau/train"
10
+
11
+ # (optional) if s3 path is specified, checkpoints will be saved at S3_FOLDER/log_directory and deleted from the local folder (except the last checkpoint). Otherwise, checkpoints will be saved locally indefinitely
12
+ # S3_BUCKET: "YOUR_S3_BUCKET"
13
+ # S3_FOLDER: 'YOUR_S3_FOLDER'
14
 
 
 
 
 
15
  save_checkpoint_every_n_steps: 1500
16
  save_top_k: -1
17
 
 
31
  batch_size: &bs 20 # TODO: change to 256
32
 
33
  data:
34
+ metadata_root: "../dataset_preperation/data/metadata/dataset_root.json"
35
  train: ['vggsounds', 'audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
36
+ val: "audioset"
37
+ test: "audioset"
38
  class_label_indices: "audioset_eval_subset"
39
  dataloader_add_ons: []
40
  augment_p : 0.0
 
49
  - caption
50
  - best_model_w_meta_pred_caption
51
  - gt_audio_caption
52
+ - autocap_caption
53
  - wavcaps_caption
54
  tags:
55
  - keywords
 
90
  params:
91
  # dataset token
92
  dataset_embed_dim: 32
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  # logging
95
  validate_uncond: False
 
204
 
205
  # The type of positional encodings to use for the time input
206
  time_pe_type: learned
 
 
 
 
207
  # Uses a label that specifies the id of the dataset from which the current input comes
208
  use_dataset_id_conditioning: True
209
  # Uses a label that specifies the resolution of the current input
210
  use_resolution_conditioning: False
 
 
211
 
212
  # Size of the input in pixels
213
  input_size: [1, *latent_t_size, *latent_f_size] # (frames_count, height, width)
 
332
  wavcaps_soundbible: 6
333
  fsd50k: 7
334
  caption_audioset: 8
335
+ autocap: 9
336
  unconditional: 0 # set the unconditional to 0 for future experiments
337
 
338