# ############################################################################
# Model: ECAPA-TDNN (big) for speaker verification
# ############################################################################

# Feature parameters
n_mels: 80

# Pretrain folder (HuggingFace)
# pretrained_path: poonehmousavi/discrete_wavlm_spk_rec_ecapatdn
pretrained_path: benchmarks/DASB/VoiceCeleb1/speaker_ver/temp

# Output parameters
out_n_neurons: 1211
save_folder: tmp

### Configuration for discrete SSL model
# ssl_model_type: hubert, wavlm, wav2vec2
# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large
ssl_model_type: wavlm # hubert, wavlm, or wav2vec2
ssl_hub: microsoft/wavlm-large
ssl_folder: !ref <save_folder>/ssl_checkpoint
kmeans_repo_id: speechbrain/SSL_Quantization
kmeans_cache_dir: !ref <save_folder>/kmeans_checkpoint
kmeans_dataset: LibriSpeech-100-360-500
freeze_ssl: True
freeze_feature_extractor: True
num_clusters: 1000

### Config for Tokenizer
# Layer numbers must be among the layers supported by the discrete SSL model
# (a k-means model must be available for each selected layer).
# The three lists below must each have num_codebooks entries: one SSL layer,
# one deduplication flag, and one optional BPE tokenizer per codebook.
# ssl_layer_num: [3, 7, 12, 23]
# deduplicate: [False, False, False, False]
# bpe_tokenizer_path: [null, null, null, null]
ssl_layer_num: [1, 3, 7, 12, 18, 23]
num_codebooks: 6
deduplicate: [False, False, False, False, False, False]
bpe_tokenizer_path: [null, null, null, null, null, null]
sample_rate: 16000

# Feature parameters
encoder_dim: 1024

# Modules
tokenizer_config:
    SSL_layers: !ref <ssl_layer_num>
    deduplicates: !ref <deduplicate>
    bpe_tokenizers: !ref <bpe_tokenizer_path>

ssl_model: !apply:speechbrain.utils.hparams.choice
    value: !ref <ssl_model_type>
    choices:
        wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM
            source: !ref <ssl_hub>
            output_norm: False
            freeze: !ref <freeze_ssl>
            freeze_feature_extractor: !ref <freeze_feature_extractor>
            output_all_hiddens: True
            save_path: !ref <ssl_folder>
        hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT
            source: !ref <ssl_hub>
            output_norm: False
            freeze: !ref <freeze_ssl>
            freeze_feature_extractor: !ref <freeze_feature_extractor>
            output_all_hiddens: True
            save_path: !ref <ssl_folder>
        wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
            source: !ref <ssl_hub>
            output_norm: False
            freeze: !ref <freeze_ssl>
            freeze_feature_extractor: !ref <freeze_feature_extractor>
            output_all_hiddens: True
            save_path: !ref <ssl_folder>

codec: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL
    save_path: !ref <kmeans_cache_dir>
    ssl_model: !ref <ssl_model>
    kmeans_dataset: !ref <kmeans_dataset>
    kmeans_repo_id: !ref <kmeans_repo_id>
    num_clusters: !ref <num_clusters>

discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer
    num_codebooks: !ref <num_codebooks>
    vocab_size: !ref <num_clusters>
    emb_dim: !ref <encoder_dim>

attention_mlp: !new:custom_model.AttentionMLP
    input_dim: !ref <encoder_dim>
    hidden_dim: !ref <encoder_dim>

embedding_model: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
    input_size: !ref <encoder_dim>
    channels: [1024, 1024, 1024, 1024, 3072]
    kernel_sizes: [5, 3, 3, 3, 1]
    dilations: [1, 2, 3, 4, 1]
    groups: [1, 1, 1, 1, 1]
    attention_channels: 128
    lin_neurons: 192

classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
    input_size: 192
    out_neurons: !ref <out_n_neurons>

modules:
    embedding_model: !ref <embedding_model>
    classifier: !ref <classifier>
    attention_mlp: !ref <attention_mlp>
    codec: !ref <codec>
    discrete_embedding_layer: !ref <discrete_embedding_layer>

label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        embedding_model: !ref <embedding_model>
        classifier: !ref <classifier>
        attention_mlp: !ref <attention_mlp>
        discrete_embedding_layer: !ref <discrete_embedding_layer>
        label_encoder: !ref <label_encoder>
    paths:
        embedding_model: !ref <pretrained_path>/embedding_model.ckpt
        classifier: !ref <pretrained_path>/classifier.ckpt
        attention_mlp: !ref <pretrained_path>/attention_mlp.ckpt
        label_encoder: !ref <pretrained_path>/label_encoder.txt
        discrete_embedding_layer: !ref <pretrained_path>/discrete_embedding_layer.ckpt
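
# ----------------------------------------------------------------------------
# Usage note (a minimal loading sketch, not part of the recipe itself): this
# file is meant to be parsed with hyperpyyaml, after which the pretrainer
# collects and loads the checkpoints listed under `paths` into the modules
# declared above. The filename "hparams.yaml" is a placeholder assumption.
#
#     from hyperpyyaml import load_hyperpyyaml
#
#     with open("hparams.yaml") as f:
#         hparams = load_hyperpyyaml(f)
#
#     # Fetch the pretrained checkpoints, then load their parameters into
#     # the corresponding modules.
#     hparams["pretrainer"].collect_files()
#     hparams["pretrainer"].load_collected()
# ----------------------------------------------------------------------------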