# Spaces: Running on Zero
# Config file
# Training configuration, grouped by section: log, dataset, dataloader,
# network settings, and optimizer.

# Log
seed: 777
use_cuda: 1  # 1 for True, 0 for False

# dataset
speaker_no: 2
mix_lst_path: ./data/allData/voxceleb2/mixture_data_list_2mix_pretrain.csv
# NOTE(review): the two directories below are absolute, machine-specific
# paths; adjust them for the environment the job runs in.
audio_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/voxceleb2/audio_clean
reference_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/  # not used
audio_sr: 16000  # presumably Hz - TODO confirm
ref_sr: 25  # presumably reference-stream frames per second - TODO confirm

# dataloader
num_workers: 4
batch_size: 2  # 4-GPU training with a total effective batch size of 8
accu_grad: 0
# per GPU, only used if accu_grad is set to 1; must be a multiple of batch_size
effec_batch_size: 4
max_length: 5  # truncate the utterances in dataloader, in seconds

# network settings
# Quoted because the value contains ':' and parentheses; the string is unchanged.
init_from: 'checkpoints/log_2024-09-30(09:49:14)'  # 'None' or a log name 'log_2024-07-22(18:12:13)'
causal: 0  # 1 for True, 0 for False
# The two sub-sections below are nested mappings. Each has its own `backbone`
# key, so they must not be flattened to the top level (duplicate keys).
network_reference:
  cue: lip  # lip or speech or gesture or EEG
  backbone: resnet18  # resnet18 or shufflenetV2 or blazenet64
  emb_size: 256  # resnet18:256
network_audio:
  backbone: mossformer2
  encoder_kernel_size: 16
  encoder_out_nchannels: 512
  encoder_in_nchannels: 1
  masknet_numspks: 1
  masknet_chunksize: 250
  masknet_numlayers: 1
  masknet_norm: "ln"
  masknet_useextralinearlayer: false
  masknet_extraskipconnection: true
  intra_numlayers: 24
  intra_nhead: 8
  intra_dffn: 1024
  intra_dropout: 0
  intra_use_positional: true
  intra_norm_before: true

# optimizer
loss_type: hybrid  # "snr", "sisdr", "hybrid"
init_learning_rate: 0.00015
max_epoch: 150
clip_grad_norm: 5