Upload autocap-full.yaml with huggingface_hub
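
The commit title names the upload step; below is a minimal sketch of it using the documented huggingface_hub API. The repo id is a hypothetical placeholder, and the file is assumed to sit in the working directory:

import huggingface_hub

huggingface_hub.upload_file(
    path_or_fileobj="autocap-full.yaml",   # local file to push
    path_in_repo="autocap-full.yaml",      # destination path inside the repo
    repo_id="USER/REPO",                   # hypothetical target repository
    commit_message="Upload autocap-full.yaml with huggingface_hub",
)
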
autocap-full.yaml ADDED (+184 -0)
target: !module src.models.pl_htsat_q_bart_captioning.AutoCap

variables:
  num_workers: &num_workers 90
  sampling_rate: &sampling_rate 32000
  warmup_epochs: &warmup_epochs 2
  lr: &lr 1.0e-5
  batch_size: &bs 128

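# Note: the &name entries above are YAML anchors; the *name aliases later in
# this file (lr: *lr, batch_size: *bs, num_workers: *num_workers,
# sampling_rate: *sampling_rate, warmup_epochs: *warmup_epochs) reuse these
# values when the config is parsed.
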
training:
  seed: 20
  pretrain: True
  pretrain_path: "PRETRAINED_CHECKPOINT"
  resume_training: False # if True, the most recent checkpoint will be found in the log folder and used to initialize the training
  precision: "high"
  nodes_count: -1 # if -1, train on the whole world size. For multinode training, please launch the module with torch.distributed.run (see the example below)
  device: "cuda"
  exclude_metrics: ['spice', 'meteor', 'spider']

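# Example multinode launch referenced above (the entry-point script name and
# its flags are hypothetical; only torch.distributed.run and its options are
# documented PyTorch CLI):
#   python -m torch.distributed.run --nnodes=2 --nproc_per_node=8 train.py
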
logging:
  project_name: "autocap"
  wandb_key: YOUR_WANDB_KEY # check wandb.ai/authorize
  log_directory: "./run_logs/autocap/train"

  # (optional) if an S3 path is specified, checkpoints will be saved at S3_FOLDER/log_directory and deleted from the local folder (except the last checkpoint). Otherwise, checkpoints will be saved locally indefinitely.
  # S3_BUCKET: "YOUR_S3_BUCKET"
  # S3_FOLDER: 'YOUR_S3_FOLDER'
  save_checkpoint_every_n_epochs: 5
  save_top_k: -1

step:
  epochs: 20
  validation_every_n_epochs: 1
  num_sanity_val_steps: 1

  # debug
  # limit_train_batches: 20
  # limit_val_batches: 2


model:
  clip_grad: 2
  audio_features_dropout_p: 0.5
  text_features_dropout_p: 0.5
  use_text_qformer: false # if not, the text tokens are fed directly to the decoder
  use_audio_qformer: true # if not, the audio features are fed directly to the decoder
  use_clap_embeds: true
  meta_input: true
  add_special_tokens: True # if not, the metadata will start with Title:, Caption:, etc.
  meta_keys: ['video_caption', 'title']
  # meta_keys: ['video_caption', 'videollama_caption', 'title', 'description', 'subtitle', 'labels']


  meta:
    max_prompt_len: 128

  clap_embeds:
    model: 'HTSAT-base'
    ckpt: 'pretrained_models/clap/music_speech_audioset_epoch_15_esc_89.98.pt'
    embed_dim: 512

  text_qformer:
    num_text_query_token: 64 # output tokens
    input_audio2tex_query_embed: true
    detach_video_query_embed: false
    frozen_text_Qformer: false
    hidden_size: 128
    add_cross_attention: true
    num_attention_heads: 8
    num_hidden_layers: 2

  audio_qformer:
    num_audio_query_token: 256
    frozen_audio_Qformer: false
    hidden_size: 256
    add_cross_attention: true
    num_attention_heads: 8
    num_hidden_layers: 2

  tokenizer:
    max_length: 30
    special_tokens: ['<HQVC>', '</HQVC>', '<AVC>', '</AVC>', '<TITLE>', '</TITLE>', '<DESC>', '</DESC>', '<SUB>', '</SUB>', '<LBL>', '</LBL>']

audio_args:
  sr: 32000
  n_fft: 1024
  hop_length: 320
  f_min: 50
  f_max: 14000
  n_mels: 64
  max_length: 10 # set to 10 for the HTSAT encoder, and to 0 or 30 for a CNN encoder
  mono: True
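  # With sr=32000, hop_length=320, and 10 s clips, each example yields about
  # 32000*10/320 = 1000 STFT frames of n_mels=64 mel bins.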

# audiocaps: audiocaps_gt_captions
# audioset: no caption, labels are available
# 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible': wavcaps_caption
# clotho: gt_captions
# fsd50k: no caption, labels are available
data_args:
  data:
    metadata_root: "../dataset_preperation/data/metadata/dataset_root.json"
    train: ['32k_captioned_audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
    val: ['32k_captioned_audiocaps']
    test: ['32k_captioned_audiocaps']

    keys_synonyms:
      gt_audio_caption:
        - audiocaps_gt_captions
        - gt_captions
        - gt_caption
        - caption
        - gt_audio_caption
        - wavcaps_caption
      tags:
        - keywords
        - tags
        - labels

  batch_size: *bs
  num_workers: *num_workers
  augmentation_p: 0.1

  preprocessing:
    video:
      fps: 1
      height: 224
      width: 224
    audio:
      sampling_rate: *sampling_rate
      max_wav_value: 32768.0
      duration: 10.0
    stft:
      filter_length: 1024
      hop_length: 320
      win_length: 1024
    mel:
      n_mel_channels: 64
      mel_fmin: 50
      mel_fmax: 14000


audio_encoder_args:
  model_arch: "transformer"
  model_name: "htsat"
  pretrained: True
  freeze: True
  spec_augment: True

text_decoder_args:
  model_tag: "audio_qformer"
  name: "facebook/bart-base"
  pretrained: true
  freeze: False
  freeze_embed_layer: False
  bert_args:
    attention_probs_dropout_prob: 0.2
    hidden_act: "gelu"
    hidden_dropout_prob: 0.2
    hidden_size: 768
    initializer_range: 0.02
    intermediate_size: 2048
    layer_norm_eps: !!float 1e-5
    max_position_embeddings: 128
    model_type: "bert"
    num_attention_heads: 4
    num_hidden_layers: 2
    add_type_embeddings: false
    vocab_size: 30522
    add_cross_attention: true
    is_decoder: true
    num_labels: 0
    name: "bert-base-uncased"


optim_args:
  scheduler: cosine
  lr: *lr
  optimizer_name: "adam"
  betas: [0.9, 0.999]
  eps: !!float 1e-8
  momentum: 0.9
  gamma: 0.05
  warmup_epochs: *warmup_epochs
  weight_decay: !!float 1e-6
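
For reference, a minimal sketch (assuming PyYAML; the placeholder constructor is illustrative) of parsing this config. The !module tag is application-specific, so a constructor must be registered before loading; the *bs/*lr aliases resolve to the values anchored under variables:

import yaml

# '!module' is a custom tag; register a constructor so safe_load accepts it.
# Here it just keeps the dotted path as a string; the training code
# presumably resolves it to the AutoCap class via importlib.
def module_constructor(loader, node):
    return loader.construct_scalar(node)

yaml.SafeLoader.add_constructor('!module', module_constructor)

with open('autocap-full.yaml') as f:
    config = yaml.safe_load(f)

print(config['target'])                    # 'src.models.pl_htsat_q_bart_captioning.AutoCap'
print(config['data_args']['batch_size'])   # 128, resolved from the &bs anchor
print(config['optim_args']['lr'])          # 1e-05, resolved from the &lr anchor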