# Listing metadata: file size 5,233 bytes; revision id 9f748f8.
# Training-run settings: output location, optimization hyperparameters,
# precision, and distributed-training options.
train_config:
  # YOUR_DATA_ROOT_DIR is a placeholder; replace it with a real path.
  expdir: YOUR_DATA_ROOT_DIR/audio-flamingo-data/checkpoint
  run_name: foundation_pretrain
  delete_previous_checkpoint: true
  batch_size: 6
  # batch_size * gradient_accumulation_steps = 6 * 8 = 48. The stated global
  # batch size of 384 presumably assumes batch_size is per device and that
  # 8 devices are used -- TODO confirm against the launch setup.
  gradient_accumulation_steps: 8  # global batchsize = 384
  seed: 42
  learning_rate: 0.0001
  lr_scheduler: constant
  loss_multiplier: 1.0
  warmup_steps: 1875
  weight_decay: 0.1
  precision: amp_bf16
  gradient_checkpointing: false
  # num_epochs * dataset_blending_global_weight = 1 (100 * 0.01 with the
  # value set under data_config).
  num_epochs: 100
  offline: false
  freeze_lm_embeddings: true
  logging_steps: 10
  dist_backend: nccl
  dist_url: env://
  no_set_device_rank: false
  fsdp: true
  # Passed into the FSDP constructor. Enables param_groups and gradient
  # masking for weight_decay. Does not work with OPT.
  fsdp_use_orig_params: false
  fsdp_sharding_strategy: full  # full, hybrid
  horovod: false
# Dataset settings: the training blend, dataset locations, loader limits, and
# the validation splits.
data_config:
  # train_config.num_epochs * dataset_blending_global_weight = 1
  # (100 * 0.01 with the values in this file).
  dataset_blending_global_weight: 0.01

  # One entry per training dataset, keyed as "<dataset name>/<split>".
  # NOTE(review): `weight` looks like a relative blending weight and
  # `prefix_prob` like a probability (all values are in [0, 1]) -- confirm
  # against the data loader. Within every `augmentations` map the values sum
  # to 1.0, so they are presumably sampling probabilities -- TODO confirm.
  dataset_blending_config:

    # Audio QA
    OpenAQA-AQA/train:
      weight: 1.0
      prefix_prob: 0.0
      augmentations:
        do_nothing: 1.0

    # Audio Captioning
    BBCSoundEffects-AudioDescription/train:
      weight: 5.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    CLAP_freesound-AudioCaptioning/train:
      weight: 1.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    SoundDescs-AudioDescription/train:
      weight: 1.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    WavCaps-AudioSet_SL-AudioCaptioning/train:
      weight: 1.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    WavCaps-BBC_Sound_Effects-AudioCaptioning/train:
      weight: 2.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    WavCaps-FreeSound-AudioCaptioning/train:
      weight: 2.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    WavCaps-SoundBible-AudioCaptioning/train:
      weight: 5.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    # Audio Classification
    AudioSetFullwoAudioMusicCaps-EventClassification/train:
      weight: 1.0
      prefix_prob: 0.5
      augmentations:
        num_words: 0.8
        do_nothing: 0.2

    WavText5K-Tagging/train:
      weight: 3.0
      prefix_prob: 0.5
      augmentations:
        num_words: 0.8
        do_nothing: 0.2

    # Speech Emotion Classification
    MSP-PODCAST-Publish-1.9-EmotionClassification/train:
      weight: 1.2
      prefix_prob: 0.5
      augmentations:
        provide_all_labels: 0.9
        do_nothing: 0.1

    MELD-EmotionClassification/train:
      weight: 1.2
      prefix_prob: 0.5
      augmentations:
        provide_all_labels: 0.9
        do_nothing: 0.1

    MELD-SentimentClassification/train:
      weight: 1.2
      prefix_prob: 0.5
      augmentations:
        provide_all_labels: 0.9
        do_nothing: 0.1

    # Music QA
    Music-AVQA-AVQA_All/train:
      weight: 3.0
      prefix_prob: 0.5
      augmentations:
        AQA_binary_instruction: 1.0

    MU-LLAMA-AQA/train:
      weight: 1.2
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    # Music Captioning
    LP-MusicCaps-MSD-AudioCaptioning/train:
      weight: 1.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    # Music Understanding
    NSynth-MIR/train:
      weight: 0.4
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    mtg-jamendo-MusicTagging/train:
      weight: 1.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

  # YOUR_DATA_ROOT_DIR is a placeholder; replace it with a real path.
  dataset_file_root: YOUR_DATA_ROOT_DIR/audio-flamingo-data/dataset_files
  data_root: YOUR_DATA_ROOT_DIR/datasets
  dataset_blending_output: dataset_blending.json
  max_tokens: 512
  num_workers: 4

  # Validation splits, keyed as "<dataset name>/<split>"; each is set to true.
  valid_dataset_config:
    CLAP_freesound-AudioCaptioning/test: true
    SoundDescs-AudioDescription/val: true
    Clotho-AQA-EventClassification/val: true
    MSP-PODCAST-Publish-1.9-EmotionClassification/val: true
    MELD-EmotionClassification/val: true
    MELD-SentimentClassification/val: true
    MU-LLAMA-AQA/test: true
    LP-MusicCaps-MSD-AudioCaptioning/val: true
    NSynth-MIR/val: true
    mtg-jamendo-MusicTagging/val: true
# Audio-encoder (CLAP) settings and audio windowing parameters.
clap_config:
  # Alternative laion-clap settings, kept for reference (disabled):
  # method: laion-clap
  # audio_embed_dim: 512
  # model_name: 630k-fusion-best
  # checkpoint: YOUR_DATA_ROOT_DIR/audio-flamingo-data/laion-clap-pretrained/laion_clap/630k-fusion-best.pt

  # Active settings. YOUR_REPO_ROOT_DIR and YOUR_DATA_ROOT_DIR are
  # placeholders; replace them with real paths.
  method: microsoft-clap
  audio_embed_dim: 1024
  config_root: YOUR_REPO_ROOT_DIR/foundation/my_ms_clap/src/configs
  model_name: 'clapcap'
  checkpoint: YOUR_DATA_ROOT_DIR/audio-flamingo-data/clap/clapcap_weights_2023.pth

  # Windowing: 7.0 s windows overlapping by 5.25 s advance 1.75 s each, so
  # 16 windows span 7.0 + 15 * 1.75 = 33.25 s.
  window_length: 7.0  # seconds
  window_overlap: 5.25  # seconds
  max_num_window: 16  # total = 33.25 seconds
  max_num_fewshot: 8  # number of fewshot samples
# Language-model and audio-transformer settings.
model_config:
  # YOUR_DATA_ROOT_DIR is a placeholder; replace it with a real path.
  cache_dir: YOUR_DATA_ROOT_DIR/audio-flamingo-data/LLM_pretrained/.cache
  lang_encoder_path: facebook/opt-iml-max-1.3b
  tokenizer_path: facebook/opt-iml-max-1.3b
  cross_attn_every_n_layers: 1
  audio_transformer_kwargs:
    n_head: 8
    n_layers: 3
    d_inner: 2048
    # Must be >= max_num_window * num_fewshot_samples. With the clap_config
    # values in this file (max_num_window: 16, max_num_fewshot: 8) that is
    # 16 * 8 = 128 -- presumably num_fewshot_samples is max_num_fewshot.
    max_num_media: 128
    # Must equal clap_config.max_num_window (16 in this file).
    max_window_per_audio: 16