Upload genau-full-l.yaml with huggingface_hub
Browse files- genau-full-l.yaml +12 -27
genau-full-l.yaml
CHANGED
@@ -4,14 +4,14 @@ training:
|
|
4 |
nodes_count: -1
|
5 |
|
6 |
logging:
|
7 |
-
project_name: "
|
8 |
-
wandb_key:
|
9 |
-
log_directory: "./
|
|
|
|
|
|
|
|
|
10 |
|
11 |
-
# Saving Checkpoints
|
12 |
-
# if s3 path is specified, checkpoints will be saved at S3_FOLDER/log_directory and deleted from the local folder (except the last checkpoint). Otherwise, checkpoints will be saved locally indefinitely
|
13 |
-
S3_BUCKET: "snap-genvid"
|
14 |
-
S3_FOLDER: 'mali6/audioldm'
|
15 |
save_checkpoint_every_n_steps: 1500
|
16 |
save_top_k: -1
|
17 |
|
@@ -31,10 +31,10 @@ variables:
|
|
31 |
batch_size: &bs 20 # TODO: change to 256
|
32 |
|
33 |
data:
|
34 |
-
metadata_root: "/
|
35 |
train: ['vggsounds', 'audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
|
36 |
-
val: "
|
37 |
-
test: "
|
38 |
class_label_indices: "audioset_eval_subset"
|
39 |
dataloader_add_ons: []
|
40 |
augment_p : 0.0
|
@@ -49,6 +49,7 @@ data:
|
|
49 |
- caption
|
50 |
- best_model_w_meta_pred_caption
|
51 |
- gt_audio_caption
|
|
|
52 |
- wavcaps_caption
|
53 |
tags:
|
54 |
- keywords
|
@@ -89,17 +90,6 @@ model:
|
|
89 |
params:
|
90 |
# dataset token
|
91 |
dataset_embed_dim: 32
|
92 |
-
dataset2id:
|
93 |
-
audiocaps: 0
|
94 |
-
clotho: 1
|
95 |
-
vggsounds: 2
|
96 |
-
wavcaps_audioset_strong: 3
|
97 |
-
wavcaps_bbcsound: 4
|
98 |
-
wavcaps_freesound: 5
|
99 |
-
wavcaps_soundbible: 6
|
100 |
-
fsd50k: 7
|
101 |
-
caption_audioset: 8
|
102 |
-
|
103 |
|
104 |
# logging
|
105 |
validate_uncond: False
|
@@ -214,16 +204,10 @@ model:
|
|
214 |
|
215 |
# The type of positional encodings to use for the time input
|
216 |
time_pe_type: learned
|
217 |
-
# Uses a label that specifies whether the current input is a video or an image
|
218 |
-
use_video_image_conditioning: False
|
219 |
-
# Uses a label that specifies the framerate of the current video
|
220 |
-
use_framerate_conditioning: False
|
221 |
# Uses a label that specifies the id of the dataset from which the current input comes
|
222 |
use_dataset_id_conditioning: True
|
223 |
# Uses a label that specifies the resolution of the current input
|
224 |
use_resolution_conditioning: False
|
225 |
-
# If True uses the unmasked parts of the denoised input as conditioning
|
226 |
-
use_denoised_input_conditioning: False
|
227 |
|
228 |
# Size of the input in pixels
|
229 |
input_size: [1, *latent_t_size, *latent_f_size] # (frames_count, height, width)
|
@@ -348,6 +332,7 @@ model:
|
|
348 |
wavcaps_soundbible: 6
|
349 |
fsd50k: 7
|
350 |
caption_audioset: 8
|
|
|
351 |
unconditional: 0 # set the unconditional to 0 for future experiments
|
352 |
|
353 |
|
|
|
4 |
nodes_count: -1
|
5 |
|
6 |
logging:
|
7 |
+
project_name: "genau"
|
8 |
+
wandb_key: YOUR_WANDB_KEY (check wandb.ai/authorize)
|
9 |
+
log_directory: "./run_logs/genau/train"
|
10 |
+
|
11 |
+
# (optional) if s3 path is specified, checkpoints will be saved at S3_FOLDER/log_directory and deleted from the local folder (except the last checkpoint). Otherwise, checkpoints will be saved locally indefinitely
|
12 |
+
# S3_BUCKET: "YOUR_S3_BUCKET"
|
13 |
+
# S3_FOLDER: 'YOUR_S3_FOLDER'
|
14 |
|
|
|
|
|
|
|
|
|
15 |
save_checkpoint_every_n_steps: 1500
|
16 |
save_top_k: -1
|
17 |
|
|
|
31 |
batch_size: &bs 20 # TODO: change to 256
|
32 |
|
33 |
data:
|
34 |
+
metadata_root: "../dataset_preperation/data/metadata/dataset_root.json"
|
35 |
train: ['vggsounds', 'audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
|
36 |
+
val: "audioset"
|
37 |
+
test: "audioset"
|
38 |
class_label_indices: "audioset_eval_subset"
|
39 |
dataloader_add_ons: []
|
40 |
augment_p : 0.0
|
|
|
49 |
- caption
|
50 |
- best_model_w_meta_pred_caption
|
51 |
- gt_audio_caption
|
52 |
+
- autocap_caption
|
53 |
- wavcaps_caption
|
54 |
tags:
|
55 |
- keywords
|
|
|
90 |
params:
|
91 |
# dataset token
|
92 |
dataset_embed_dim: 32
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
|
94 |
# logging
|
95 |
validate_uncond: False
|
|
|
204 |
|
205 |
# The type of positional encodings to use for the time input
|
206 |
time_pe_type: learned
|
|
|
|
|
|
|
|
|
207 |
# Uses a label that specifies the id of the dataset from which the current input comes
|
208 |
use_dataset_id_conditioning: True
|
209 |
# Uses a label that specifies the resolution of the current input
|
210 |
use_resolution_conditioning: False
|
|
|
|
|
211 |
|
212 |
# Size of the input in pixels
|
213 |
input_size: [1, *latent_t_size, *latent_f_size] # (frames_count, height, width)
|
|
|
332 |
wavcaps_soundbible: 6
|
333 |
fsd50k: 7
|
334 |
caption_audioset: 8
|
335 |
+
autocap: 9
|
336 |
unconditional: 0 # set the unconditional to 0 for future experiments
|
337 |
|
338 |
|