# autocap / autocap-full.yaml
target: !module src.models.pl_htsat_q_bart_captioning.AutoCap
variables:
  num_workers: &num_workers 90
  sampling_rate: &sampling_rate 32000
  warmup_epochs: &warmup_epochs 2
  lr: &lr 1.0e-5
  batch_size: &bs 128
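# The &name entries above are YAML anchors; they are dereferenced later in this
# file via *name aliases (data_args uses *bs and *num_workers, preprocessing
# uses *sampling_rate, and optim_args uses *lr and *warmup_epochs).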
training:
  seed: 20
  pretrain: True
  pretrain_path: "PRETRAINED_CHECKPOINT"
  resume_training: False # if true, the most recent checkpoint in the log folder is used to initialize the training
  precision: "high"
  nodes_count: -1 # if -1, train on the whole world size. For multi-node training, please launch the module with torch.distributed.run
  device: "cuda"
  exclude_metrics: ['spice', 'meteor', 'spider']
logging:
  project_name: "autocap"
  wandb_key: YOUR_WANDB_KEY # check wandb.ai/authorize
  log_directory: "./run_logs/autocap/train"
  # (optional) if an S3 path is specified, checkpoints will be saved at
  # S3_FOLDER/log_directory and deleted from the local folder (except the last
  # checkpoint). Otherwise, checkpoints will be saved locally indefinitely.
  # S3_BUCKET: "YOUR_S3_BUCKET"
  # S3_FOLDER: 'YOUR_S3_FOLDER'
  save_checkpoint_every_n_epochs: 5
  save_top_k: -1
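  # Presumably following PyTorch Lightning ModelCheckpoint semantics (the
  # target is a pl_ module): save_top_k: -1 keeps every saved checkpoint
  # rather than only the best k.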
step:
  epochs: 20
  validation_every_n_epochs: 1
  num_sanity_val_steps: 1
  # debug
  # limit_train_batches: 20
  # limit_val_batches: 2
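  # Assumption: the commented limit_train_batches / limit_val_batches flags map
  # to the PyTorch Lightning Trainer arguments of the same name, which cap the
  # number of batches per epoch for quick debugging runs.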
model:
  clip_grad: 2
  audio_features_dropout_p: 0.5
  text_features_dropout_p: 0.5
  use_text_qformer: false # if false, the text tokens are fed directly to the decoder
  use_audio_qformer: true # if false, the audio features are fed directly to the decoder
  use_clap_embeds: true
  meta_input: true
  add_special_tokens: True # if false, the metadata will start with Title:, Caption:, etc.
  meta_keys: ['video_caption', 'title']
  # meta_keys: ['video_caption', 'videollama_caption', 'title', 'description', 'subtitle', 'labels']
  meta:
    max_prompt_len: 128
  clap_embeds:
    model: 'HTSAT-base'
    ckpt: 'pretrained_models/clap/music_speech_audioset_epoch_15_esc_89.98.pt'
    embed_dim: 512
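  # The ckpt filename above matches LAION-CLAP's published
  # music_speech_audioset_epoch_15_esc_89.98.pt checkpoint, so the file is
  # presumably downloaded from the LAION-CLAP release into pretrained_models/clap/.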
  text_qformer:
    num_text_query_token: 64 # output tokens
    input_audio2tex_query_embed: true
    detach_video_query_embed: false
    frozen_text_Qformer: false
    hidden_size: 128
    add_cross_attention: true
    num_attention_heads: 8
    num_hidden_layers: 2
  audio_qformer:
    num_audio_query_token: 256
    frozen_audio_Qformer: false
    hidden_size: 256
    add_cross_attention: true
    num_attention_heads: 8
    num_hidden_layers: 2
  tokenizer:
    max_length: 30
    special_tokens: ['<HQVC>', '</HQVC>', '<AVC>', '</AVC>', '<TITLE>', '</TITLE>', '<DESC>', '</DESC>', '<SUB>', '</SUB>', '<LBL>', '</LBL>']
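  # Assumption based on the add_special_tokens comment above: when
  # add_special_tokens is True, each metadata field is presumably wrapped in its
  # paired tag (<TITLE>...</TITLE> for title, <AVC>...</AVC> for video_caption,
  # etc.) instead of plain "Title:" / "Caption:" prefixes.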
  audio_args:
    sr: 32000
    n_fft: 1024
    hop_length: 320
    f_min: 50
    f_max: 14000
    n_mels: 64
    max_length: 10 # set to 10 for the HTSAT encoder, and to 0 or 30 for a CNN encoder
    mono: True
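  # Derived from the values above: with sr 32000 and hop_length 320, the STFT
  # produces 100 frames per second, so a 10 s clip yields a 64-bin mel
  # spectrogram of roughly 1000 frames, which is the input length HTSAT-style
  # encoders are typically trained on.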
# Caption key provided by each dataset:
#   audiocaps: audiocaps_gt_captions
#   audioset: no caption; labels are available
#   wavcaps_audioset_strong, wavcaps_bbcsound, wavcaps_freesound, wavcaps_soundbible: wavcaps_caption
#   clotho: gt_captions
#   fsd50k: no caption; labels are available
data_args:
  data:
    metadata_root: "../dataset_preperation/data/metadata/dataset_root.json"
    train: ['32k_captioned_audiocaps', 'caption_audioset', 'wavcaps_audioset_strong', 'wavcaps_bbcsound', 'wavcaps_freesound', 'wavcaps_soundbible', 'clotho', 'fsd50k']
    val: ['32k_captioned_audiocaps']
    test: ['32k_captioned_audiocaps']
    keys_synonyms:
      gt_audio_caption:
        - audiocaps_gt_captions
        - gt_captions
        - gt_caption
        - caption
        - gt_audio_caption
        - wavcaps_caption
      tags:
        - keywords
        - tags
        - labels
  batch_size: *bs
  num_workers: *num_workers
  augmentation_p: 0.1
  preprocessing:
    video:
      fps: 1
      height: 224
      width: 224
    audio:
      sampling_rate: *sampling_rate
      max_wav_value: 32768.0
      duration: 10.0
    stft:
      filter_length: 1024
      hop_length: 320
      win_length: 1024
    mel:
      n_mel_channels: 64
      mel_fmin: 50
      mel_fmax: 14000
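    # Note: the stft/mel values above duplicate model.audio_args (n_fft 1024,
    # hop_length 320, 64 mel bins, 50-14000 Hz); the two blocks should be kept
    # in sync.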
audio_encoder_args:
  model_arch: "transformer"
  model_name: "htsat"
  pretrained: True
  freeze: True
  spec_augment: True
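# Note: with model_name "htsat" here, model.audio_args.max_length above should
# stay at 10, per the comment in the audio_args block.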
text_decoder_args:
  model_tag: "audio_qformer"
  name: "facebook/bart-base"
  pretrained: true
  freeze: False
  freeze_embed_layer: False
  bert_args:
    attention_probs_dropout_prob: 0.2
    hidden_act: "gelu"
    hidden_dropout_prob: 0.2
    hidden_size: 768
    initializer_range: 0.02
    intermediate_size: 2048
    layer_norm_eps: !!float 1e-5
    max_position_embeddings: 128
    model_type: "bert"
    num_attention_heads: 4
    num_hidden_layers: 2
    add_type_embeddings: false
    vocab_size: 30522
    add_cross_attention: true
    is_decoder: true
    num_labels: 0
    name: "bert-base-uncased"
optim_args:
  scheduler: cosine
  lr: *lr
  optimizer_name: "adam"
  betas: [0.9, 0.999]
  eps: !!float 1e-8
  momentum: 0.9 # unused by adam, which relies on betas; presumably read only by other optimizer choices
  gamma: 0.05
  warmup_epochs: *warmup_epochs
  weight_decay: !!float 1e-6
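# Loading note (an assumption, not part of the training code): the !module tag
# at the top of this file is a custom YAML tag, so a plain yaml.safe_load will
# reject it. A minimal Python sketch for inspecting this config, registering a
# pass-through constructor that keeps the tagged value as a string:
#
#   import yaml
#
#   class ConfigLoader(yaml.SafeLoader):
#       pass
#
#   # Keep the "!module" scalar as a plain string instead of importing it.
#   ConfigLoader.add_constructor('!module', lambda loader, node: loader.construct_scalar(node))
#
#   with open('autocap-full.yaml') as f:
#       cfg = yaml.load(f, Loader=ConfigLoader)
#   print(cfg['target'])                   # src.models.pl_htsat_q_bart_captioning.AutoCap
#   print(cfg['data_args']['batch_size'])  # 128, resolved from the *bs alias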