---
# Dotted path handed to the custom `!module` tag. `!module` is not a standard
# YAML tag, so this file must be read by a loader that registers a constructor
# for it.
# NOTE(review): presumably the tag imports the object at this path — confirm
# against the project's config loader.
target: !module src.models.pl_htsat_q_bart_captioning.AutoCap
# Shared constants. Each value is anchored so later sections can alias it
# (`*bs`, `*num_workers`, `*sampling_rate`, `*lr`, `*warmup_epochs`) instead of
# repeating the literal.
variables:
  num_workers: &num_workers 90
  sampling_rate: &sampling_rate 32000
  warmup_epochs: &warmup_epochs 2
  # Written with a decimal point and a signed exponent so that YAML 1.1
  # loaders resolve it as a float rather than a string.
  lr: &lr 1.0e-5
  batch_size: &bs 128
# Run-level training switches.
training:
  seed: 20
  pretrain: true
  # NOTE(review): looks like a placeholder to be replaced with a real
  # checkpoint path before running — confirm.
  pretrain_path: "PRETAINED_CHECKPOINT"
  resume_training: false
  precision: "high"
  # NOTE(review): -1 presumably means "auto-detect" — confirm with the trainer.
  nodes_count: -1
  device: "cuda"
  # Metric names listed here are excluded from evaluation.
  # NOTE(review): exact effect depends on the evaluation code — confirm.
  exclude_metrics: ['spice', 'meteor', 'spider']
# Experiment tracking and checkpoint settings.
# NOTE(review): indentation was lost in this copy; the two checkpoint keys at
# the end are assumed to belong to `logging` — confirm against the training
# script.
logging:
  project_name: "autocap"
  # Placeholder, not a real key. Keep actual credentials out of version
  # control (for example, supply them through the environment).
  wandb_key: "YOUR_WANDB_KEY (check wandb.ai/authorize)"
  log_directory: "./run_logs/autocap/train"

  save_checkpoint_every_n_epochs: 5
  # NOTE(review): -1 presumably means "keep every checkpoint" — confirm.
  save_top_k: -1
# Epoch budget and validation cadence.
step:
  epochs: 20
  validation_every_n_epochs: 1
  num_sanity_val_steps: 1
# Model-level switches: gradient clipping, feature dropout, and which optional
# components (text/audio Q-Former, CLAP embeddings, metadata input) are enabled.
# NOTE(review): indentation was lost in this copy. Only the scalar keys below
# are placed under `model`; the sections that follow in the file (`meta`,
# `clap_embeds`, `text_qformer`, `audio_qformer`, `tokenizer`, `audio_args`)
# may also belong here — confirm against the code that reads this config.
model:
  clip_grad: 2
  audio_features_dropout_p: 0.5
  text_features_dropout_p: 0.5
  use_text_qformer: false
  use_audio_qformer: true
  use_clap_embeds: true
  meta_input: true
  add_special_tokens: true
  meta_keys: ['video_caption', 'title']
# Settings for the metadata prompt.
# NOTE(review): indentation was lost in this copy; this section is written at
# top level as it appears here — confirm it is not meant to sit under `model:`.
meta:
  max_prompt_len: 128
# CLAP embedding model: architecture name, checkpoint path and embedding size.
# NOTE(review): indentation was lost in this copy; this section is written at
# top level as it appears here — confirm it is not meant to sit under `model:`.
clap_embeds:
  model: 'HTSAT-base'
  ckpt: 'pretrained_models/clap/music_speech_audioset_epoch_15_esc_89.98.pt'
  embed_dim: 512
# Text Q-Former hyperparameters.
# NOTE(review): `use_text_qformer` is false under `model`, so these values are
# presumably not consumed in this run — confirm.
# NOTE(review): indentation was lost in this copy; this section is written at
# top level as it appears here — confirm it is not meant to sit under `model:`.
text_qformer:
  num_text_query_token: 64
  input_audio2tex_query_embed: true
  detach_video_query_embed: false
  frozen_text_Qformer: false
  hidden_size: 128
  add_cross_attention: true
  num_attention_heads: 8
  num_hidden_layers: 2
# Audio Q-Former hyperparameters (`use_audio_qformer` is true under `model`).
# NOTE(review): indentation was lost in this copy; this section is written at
# top level as it appears here — confirm it is not meant to sit under `model:`.
audio_qformer:
  num_audio_query_token: 256
  frozen_audio_Qformer: false
  hidden_size: 256
  add_cross_attention: true
  num_attention_heads: 8
  num_hidden_layers: 2
# Tokenizer limits and the extra marker tokens (open/close pairs).
# NOTE(review): indentation was lost in this copy; this section is written at
# top level as it appears here — confirm it is not meant to sit under `model:`.
tokenizer:
  max_length: 30
  # Same twelve values as the original one-line flow list, in the same order.
  special_tokens:
    - '<HQVC>'
    - '</HQVC>'
    - '<AVC>'
    - '</AVC>'
    - '<TITLE>'
    - '</TITLE>'
    - '<DESC>'
    - '</DESC>'
    - '<SUB>'
    - '</SUB>'
    - '<LBL>'
    - '</LBL>'
# Audio front-end parameters.
# NOTE(review): indentation was lost in this copy; this section is written at
# top level as it appears here — confirm it is not meant to sit under `model:`.
audio_args:
  # Same value as `variables.sampling_rate` (32000), written as a literal here.
  sr: 32000
  # n_fft, hop_length, f_min, f_max and n_mels repeat the stft/mel values
  # listed under `preprocessing` (1024, 320, 50, 14000, 64).
  n_fft: 1024
  hop_length: 320
  f_min: 50
  f_max: 14000
  n_mels: 64
  # NOTE(review): presumably seconds (matches `duration: 10.0` under
  # `preprocessing`) — confirm.
  max_length: 10
  mono: true
# Dataset selection, key synonyms, loader settings and audio/video
# preprocessing parameters.
# NOTE(review): this copy has lost its indentation and every line carries a
# stray trailing `|` (blank lines are a bare `|`), so the nesting of this
# section cannot be read back reliably — e.g. whether `keys_synonyms`,
# `batch_size`/`num_workers`/`augmentation_p` and `preprocessing` sit under
# `data_args` or under `data`. Restore the layout from what the consuming code
# expects before using this file; it does not load as YAML in this state.
data_args: |
|
data: |
|
# Relative path; "dataset_preperation" (sic) has to match the directory name
# on disk, so do not correct the spelling here alone.
metadata_root: "../dataset_preperation/data/metadata/dataset_root.json" |
|
# Dataset names per split; validation and test both use only
# '32k_captioned_audiocaps'.
train: ['32k_captioned_audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k'] |
|
val: ['32k_captioned_audiocaps'] |
|
test: ['32k_captioned_audiocaps'] |
|
|
|
# Lists of alternative key names.
# NOTE(review): presumably any listed name is accepted in dataset metadata in
# place of the heading key (`gt_audio_caption`, `tags`) — confirm.
keys_synonyms: |
|
gt_audio_caption: |
|
- audiocaps_gt_captions |
|
- gt_captions |
|
- gt_caption |
|
- caption |
|
- gt_audio_caption |
|
- wavcaps_caption |
|
tags: |
|
- keywords |
|
- tags |
|
- labels |
|
|
|
# Aliases of the `&bs` (128) and `&num_workers` (90) anchors defined under
# `variables`.
batch_size: *bs |
|
num_workers: *num_workers |
|
augmentation_p : 0.1 |
|
|
|
preprocessing: |
|
video: |
|
fps : 1 |
|
height: 224 |
|
width: 224 |
|
audio: |
|
# Alias of the `&sampling_rate` (32000) anchor defined under `variables`.
sampling_rate: *sampling_rate |
|
max_wav_value: 32768.0 |
|
duration: 10.0 |
|
# filter_length and hop_length repeat `audio_args` n_fft (1024) and
# hop_length (320).
stft: |
|
filter_length: 1024 |
|
hop_length: 320 |
|
win_length: 1024 |
|
# Same values as `audio_args` n_mels (64), f_min (50) and f_max (14000).
mel: |
|
n_mel_channels: 64 |
|
mel_fmin: 50 |
|
mel_fmax: 14000 |
|
|
|
|
|
# Audio encoder selection: an HTSAT transformer that is loaded pretrained and
# kept frozen, with spec-augment switched on.
audio_encoder_args:
  model_arch: "transformer"
  model_name: "htsat"
  pretrained: true
  freeze: true
  spec_augment: true
# Text decoder settings: a pretrained, trainable `facebook/bart-base` plus a
# nested BERT-style configuration.
# NOTE(review): indentation was lost in this copy. `bert_args` is nested here
# and holds every key through the second `name`; leaving that `name` at the
# section level would duplicate the `name` key above it.
text_decoder_args:
  model_tag: "audio_qformer"
  name: "facebook/bart-base"
  pretrained: true
  freeze: false
  freeze_embed_layer: false
  bert_args:
    attention_probs_dropout_prob: 0.2
    hidden_act: "gelu"
    hidden_dropout_prob: 0.2
    hidden_size: 768
    initializer_range: 0.02
    intermediate_size: 2048
    # Explicit tag: a dotless `1e-5` is not resolved as a float by YAML 1.1
    # loaders, so the `!!float` must stay.
    layer_norm_eps: !!float 1e-5
    max_position_embeddings: 128
    model_type: "bert"
    num_attention_heads: 4
    num_hidden_layers: 2
    add_type_embeddings: false
    vocab_size: 30522
    add_cross_attention: true
    is_decoder: true
    num_labels: 0
    name: "bert-base-uncased"
# Optimizer and learning-rate schedule.
optim_args:
  scheduler: cosine
  # Alias of the `&lr` (1.0e-5) anchor defined under `variables`.
  lr: *lr
  optimizer_name: "adam"
  betas: [0.9, 0.999]
  # Explicit tag: a dotless `1e-8` is not resolved as a float by YAML 1.1
  # loaders, so the `!!float` must stay (same for `weight_decay`).
  eps: !!float 1e-8
  # NOTE(review): `momentum` and `gamma` are presumably read only by other
  # optimizer/scheduler choices than adam + cosine — confirm.
  momentum: 0.9
  gamma: 0.05
  # Alias of the `&warmup_epochs` (2) anchor defined under `variables`.
  warmup_epochs: *warmup_epochs
  weight_decay: !!float 1e-6