---
# Configuration for the `foundation_pretrain` run.
#
# Top-level sections:
#   train_config - optimisation, precision, logging and distributed settings
#   data_config  - training-set blending, data locations, validation sets
#   clap_config  - audio encoder (method, weights) and windowing parameters
#   model_config - language model / tokenizer and audio transformer settings
#
# The YOUR_DATA_ROOT_DIR and YOUR_REPO_ROOT_DIR path prefixes look like
# placeholders; replace them with real locations before use.

train_config:
  expdir: YOUR_DATA_ROOT_DIR/audio-flamingo-data/checkpoint
  run_name: foundation_pretrain
  delete_previous_checkpoint: true
  batch_size: 6
  gradient_accumulation_steps: 8
  seed: 42
  learning_rate: 0.0001
  lr_scheduler: constant
  loss_multiplier: 1.0
  warmup_steps: 1875
  weight_decay: 0.1
  precision: amp_bf16
  gradient_checkpointing: false
  num_epochs: 100
  offline: false
  freeze_lm_embeddings: true
  logging_steps: 10
  dist_backend: nccl
  dist_url: env://
  no_set_device_rank: false
  fsdp: true
  fsdp_use_orig_params: false
  fsdp_sharding_strategy: full
  horovod: false

data_config:
  dataset_blending_global_weight: 0.01

  # One entry per training set, keyed as `<dataset>-<task>/<split>`.
  # Each entry carries a `weight`, a `prefix_prob` and an `augmentations`
  # mapping; the values inside every `augmentations` mapping sum to 1.0.
  # NOTE(review): presumably `weight` is a relative sampling weight and the
  # augmentation values are probabilities - confirm against the data loader.
  dataset_blending_config:

    OpenAQA-AQA/train:
      weight: 1.0
      prefix_prob: 0.0
      augmentations:
        do_nothing: 1.0

    BBCSoundEffects-AudioDescription/train:
      weight: 5.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    CLAP_freesound-AudioCaptioning/train:
      weight: 1.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    SoundDescs-AudioDescription/train:
      weight: 1.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    WavCaps-AudioSet_SL-AudioCaptioning/train:
      weight: 1.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    WavCaps-BBC_Sound_Effects-AudioCaptioning/train:
      weight: 2.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    WavCaps-FreeSound-AudioCaptioning/train:
      weight: 2.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    WavCaps-SoundBible-AudioCaptioning/train:
      weight: 5.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    AudioSetFullwoAudioMusicCaps-EventClassification/train:
      weight: 1.0
      prefix_prob: 0.5
      augmentations:
        num_words: 0.8
        do_nothing: 0.2

    WavText5K-Tagging/train:
      weight: 3.0
      prefix_prob: 0.5
      augmentations:
        num_words: 0.8
        do_nothing: 0.2

    MSP-PODCAST-Publish-1.9-EmotionClassification/train:
      weight: 1.2
      prefix_prob: 0.5
      augmentations:
        provide_all_labels: 0.9
        do_nothing: 0.1

    MELD-EmotionClassification/train:
      weight: 1.2
      prefix_prob: 0.5
      augmentations:
        provide_all_labels: 0.9
        do_nothing: 0.1

    MELD-SentimentClassification/train:
      weight: 1.2
      prefix_prob: 0.5
      augmentations:
        provide_all_labels: 0.9
        do_nothing: 0.1

    Music-AVQA-AVQA_All/train:
      weight: 3.0
      prefix_prob: 0.5
      augmentations:
        AQA_binary_instruction: 1.0

    MU-LLAMA-AQA/train:
      weight: 1.2
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    LP-MusicCaps-MSD-AudioCaptioning/train:
      weight: 1.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    NSynth-MIR/train:
      weight: 0.4
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

    mtg-jamendo-MusicTagging/train:
      weight: 1.0
      prefix_prob: 0.5
      augmentations:
        do_nothing: 1.0

  dataset_file_root: YOUR_DATA_ROOT_DIR/audio-flamingo-data/dataset_files
  data_root: YOUR_DATA_ROOT_DIR/datasets
  dataset_blending_output: dataset_blending.json
  max_tokens: 512
  num_workers: 4

  # Validation sets, keyed like the training entries; each maps to a boolean.
  # NOTE(review): placed under `data_config` on the assumption that it sits
  # next to `dataset_blending_config` - confirm against the config loader.
  valid_dataset_config:
    CLAP_freesound-AudioCaptioning/test: true
    SoundDescs-AudioDescription/val: true
    Clotho-AQA-EventClassification/val: true
    MSP-PODCAST-Publish-1.9-EmotionClassification/val: true
    MELD-EmotionClassification/val: true
    MELD-SentimentClassification/val: true
    MU-LLAMA-AQA/test: true
    LP-MusicCaps-MSD-AudioCaptioning/val: true
    NSynth-MIR/val: true
    mtg-jamendo-MusicTagging/val: true

clap_config:
  method: microsoft-clap
  audio_embed_dim: 1024
  config_root: YOUR_REPO_ROOT_DIR/foundation/my_ms_clap/src/configs
  model_name: 'clapcap'
  checkpoint: YOUR_DATA_ROOT_DIR/audio-flamingo-data/clap/clapcap_weights_2023.pth

  # NOTE(review): window_length / window_overlap are presumably in seconds -
  # confirm against the audio loading code.
  window_length: 7.0
  window_overlap: 5.25
  max_num_window: 16
  max_num_fewshot: 8

model_config:
  cache_dir: YOUR_DATA_ROOT_DIR/audio-flamingo-data/LLM_pretrained/.cache

  lang_encoder_path: facebook/opt-iml-max-1.3b
  tokenizer_path: facebook/opt-iml-max-1.3b
  cross_attn_every_n_layers: 1
  audio_transformer_kwargs:
    n_head: 8
    n_layers: 3
    d_inner: 2048
    # NOTE(review): 128 >= train_config.batch_size (6) *
    # clap_config.max_num_window (16) = 96; presumably a required bound -
    # confirm before changing either value.
    max_num_media: 128
    # Same value as clap_config.max_num_window (16).
    # NOTE(review): presumably the two must stay equal - confirm.
    max_window_per_audio: 16