mali6 committed · Commit b4acacf · verified · 1 Parent(s): a91bd29

Upload autocap-full.yaml with huggingface_hub

Files changed (1)
  autocap-full.yaml +184 -0
autocap-full.yaml ADDED
@@ -0,0 +1,184 @@
+ target: !module src.models.pl_htsat_q_bart_captioning.AutoCap
+
+ variables:
+   num_workers: &num_workers 90
+   sampling_rate: &sampling_rate 32000
+   warmup_epochs: &warmup_epochs 2
+   lr: &lr 1.0e-5
+   batch_size: &bs 128
+
+ training:
+   seed: 20
+   pretrain: True
+   pretrain_path: "PRETRAINED_CHECKPOINT"
+   resume_training: False # if true, the most recent checkpoint in the log folder is used to initialize the training
+   precision: "high"
+   nodes_count: -1 # if -1, train on the whole world size. For multinode training, please launch the module with torch.distributed.run
+   device: "cuda"
+   exclude_metrics: ['spice', 'meteor', 'spider']
+
+ logging:
+   project_name: "autocap"
+   wandb_key: "YOUR_WANDB_KEY" # check wandb.ai/authorize
+   log_directory: "./run_logs/autocap/train"
+
+   # (optional) if an S3 path is specified, checkpoints will be saved at S3_FOLDER/log_directory and deleted from the local folder (except the last checkpoint). Otherwise, checkpoints will be saved locally indefinitely
+   # S3_BUCKET: "YOUR_S3_BUCKET"
+   # S3_FOLDER: 'YOUR_S3_FOLDER'
+   save_checkpoint_every_n_epochs: 5
+   save_top_k: -1
+
+ step:
+   epochs: 20
+   validation_every_n_epochs: 1
+   num_sanity_val_steps: 1
+
+   # debug
+   # limit_train_batches: 20
+   # limit_val_batches: 2
+
+
+ model:
+   clip_grad: 2
+   audio_features_dropout_p: 0.5
+   text_features_dropout_p: 0.5
+   use_text_qformer: false # if false, the text tokens are fed directly to the decoder
+   use_audio_qformer: true # if false, the audio features are fed directly to the decoder
+   use_clap_embeds: true
+   meta_input: true
+   add_special_tokens: True # if false, the metadata will start with Title:, Caption:, etc.
+   meta_keys: ['video_caption', 'title']
+   # meta_keys: ['video_caption', 'videollama_caption', 'title', 'description', 'subtitle', 'labels']
+
+
+   meta:
+     max_prompt_len: 128
+
+   clap_embeds:
+     model: 'HTSAT-base'
+     ckpt: 'pretrained_models/clap/music_speech_audioset_epoch_15_esc_89.98.pt'
+     embed_dim: 512
+
+   text_qformer:
+     num_text_query_token: 64 # output tokens
+     input_audio2tex_query_embed: true
+     detach_video_query_embed: false
+     frozen_text_Qformer: false
+     hidden_size: 128
+     add_cross_attention: true
+     num_attention_heads: 8
+     num_hidden_layers: 2
+
+   audio_qformer:
+     num_audio_query_token: 256
+     frozen_audio_Qformer: false
+     hidden_size: 256
+     add_cross_attention: true
+     num_attention_heads: 8
+     num_hidden_layers: 2
+
+   tokenizer:
+     max_length: 30
+     special_tokens: ['<HQVC>', '</HQVC>', '<AVC>', '</AVC>', '<TITLE>', '</TITLE>', '<DESC>', '</DESC>', '<SUB>', '</SUB>', '<LBL>', '</LBL>']
+
+   audio_args:
+     sr: 32000
+     n_fft: 1024
+     hop_length: 320
+     f_min: 50
+     f_max: 14000
+     n_mels: 64
+     max_length: 10 # set to 10 for the HTSAT encoder, and to 0 or 30 for a CNN encoder
+     mono: True
+
+ # audiocaps: audiocaps_gt_captions
+ # audioset: no caption, labels are available
+ # 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible': wavcaps_caption
+ # clotho: gt_captions
+ # fsd50k: no caption, labels are available
+ data_args:
+   data:
+     metadata_root: "../dataset_preperation/data/metadata/dataset_root.json"
+     train: ['32k_captioned_audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
+     val: ['32k_captioned_audiocaps']
+     test: ['32k_captioned_audiocaps']
+
+   keys_synonyms:
+     gt_audio_caption:
+       - audiocaps_gt_captions
+       - gt_captions
+       - gt_caption
+       - caption
+       - gt_audio_caption
+       - wavcaps_caption
+     tags:
+       - keywords
+       - tags
+       - labels
+
+   batch_size: *bs
+   num_workers: *num_workers
+   augmentation_p: 0.1
+
+   preprocessing:
+     video:
+       fps: 1
+       height: 224
+       width: 224
+     audio:
+       sampling_rate: *sampling_rate
+       max_wav_value: 32768.0
+       duration: 10.0
+     stft:
+       filter_length: 1024
+       hop_length: 320
+       win_length: 1024
+     mel:
+       n_mel_channels: 64
+       mel_fmin: 50
+       mel_fmax: 14000
+
+
+ audio_encoder_args:
+   model_arch: "transformer"
+   model_name: "htsat"
+   pretrained: True
+   freeze: True
+   spec_augment: True
+
+ text_decoder_args:
+   model_tag: "audio_qformer"
+   name: "facebook/bart-base"
+   pretrained: true
+   freeze: False
+   freeze_embed_layer: False
+   bert_args:
+     attention_probs_dropout_prob: 0.2
+     hidden_act: "gelu"
+     hidden_dropout_prob: 0.2
+     hidden_size: 768
+     initializer_range: 0.02
+     intermediate_size: 2048
+     layer_norm_eps: !!float 1e-5
+     max_position_embeddings: 128
+     model_type: "bert"
+     num_attention_heads: 4
+     num_hidden_layers: 2
+     add_type_embeddings: false
+     vocab_size: 30522
+     add_cross_attention: true
+     is_decoder: true
+     num_labels: 0
+     name: "bert-base-uncased"
+
+
+ optim_args:
+   scheduler: cosine
+   lr: *lr
+   optimizer_name: "adam"
+   betas: [0.9, 0.999]
+   eps: !!float 1e-8
+   momentum: 0.9
+   gamma: 0.05
+   warmup_epochs: *warmup_epochs
+   weight_decay: !!float 1e-6
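
The `variables:` block at the top of the file defines YAML anchors (`&num_workers`, `&bs`, ...) that later sections dereference with aliases (`*num_workers`, `*bs`), so shared values such as the batch size and learning rate are declared once. Below is a minimal sketch of inspecting this config with PyYAML; the `!module` tag is project-specific (the training code presumably resolves it to the `AutoCap` class), so the sketch registers a pass-through constructor for it as an assumption rather than replicating the repo's actual loader.

```python
# Minimal sketch, assuming PyYAML and that autocap-full.yaml is in the
# working directory. Treating `!module` as a plain string is an
# assumption; the AutoCap codebase may import the named class directly.
import yaml

# Register a pass-through constructor so safe_load accepts the custom tag.
yaml.SafeLoader.add_constructor(
    "!module", lambda loader, node: loader.construct_scalar(node)
)

with open("autocap-full.yaml") as f:
    cfg = yaml.safe_load(f)

# Anchors are resolved at parse time, so aliased fields already hold
# concrete values rather than references:
print(cfg["data_args"]["batch_size"])  # 128, via *bs
print(cfg["optim_args"]["lr"])         # 1e-05, via *lr
print(cfg["target"])                   # src.models.pl_htsat_q_bart_captioning.AutoCap
```

Note that `layer_norm_eps` and `eps` carry an explicit `!!float` tag because PyYAML's YAML 1.1 resolver parses `1e-5` without a decimal point as a string, whereas `1.0e-5` (as used for `lr`) is already recognized as a float.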