sinarashidi committed on
Commit
2a71bf4
1 Parent(s): 45b814a

Update hyperparams.yaml

Browse files
Files changed (1) hide show
  1. hyperparams.yaml +100 -0
hyperparams.yaml CHANGED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pretrained_path: sinarashidi/temp
2
+
3
+ sample_rate: 16000
4
+
5
+ # URL for the HuggingFace model we want to load as encoder
6
+ wav2vec2_hub: m3hrdadfi/wav2vec2-large-xlsr-persian-v3
7
+
8
+ # Outputs
9
+ vocab_size: 100
10
+ blank_index: 99
11
+ bos_index: 97
12
+ eos_index: 98
13
+ pad_index: 99
14
+ label_smoothing: 0.0
15
+
16
+ # Encoder
17
+ features_dim: 1024
18
+
19
+ # Length Regulator
20
+ enc_kernel_size: 3
21
+ enc_stride: 2
22
+
23
+ # Transformer decoder
24
+ embedding_size: 512
25
+ d_model: 512
26
+ nhead: 8
27
+ num_encoder_layers: 0
28
+ num_decoder_layers: 6
29
+ d_ffn: 2048
30
+ transformer_dropout: 0.1
31
+ activation: !name:torch.nn.GELU
32
+ output_neurons: !ref <vocab_size>
33
+ attention_type: "RelPosMHAXL"
34
+
35
+ # Decoding parameters
36
+ min_decode_ratio: 0.0
37
+ max_decode_ratio: 1.0
38
+
39
+ wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
40
+ source: !ref <wav2vec2_hub>
41
+ output_norm: True
42
+ freeze: True
43
+ freeze_feature_extractor: True
44
+ apply_spec_augment: True
45
+ save_path: wav2vec2_checkpoints
46
+
47
+ length_regulator: !new:speechbrain.nnet.CNN.Conv1d
48
+ input_shape: [null, null, !ref <features_dim>]
49
+ out_channels: !ref <embedding_size>
50
+ kernel_size: !ref <enc_kernel_size>
51
+ stride: !ref <enc_stride>
52
+
53
+ transformer_decoder: !new:speechbrain.lobes.models.transformer.TransformerST.TransformerST # yamllint disable-line rule:line-length
54
+ input_size: !ref <embedding_size>
55
+ tgt_vocab: !ref <output_neurons>
56
+ d_model: !ref <d_model>
57
+ nhead: !ref <nhead>
58
+ num_encoder_layers: !ref <num_encoder_layers>
59
+ num_decoder_layers: !ref <num_decoder_layers>
60
+ d_ffn: !ref <d_ffn>
61
+ dropout: !ref <transformer_dropout>
62
+ activation: !ref <activation>
63
+ attention_type: !ref <attention_type>
64
+ normalize_before: True
65
+ causal: False
66
+
67
+ log_softmax: !new:speechbrain.nnet.activations.Softmax
68
+ apply_log: True
69
+
70
+ seq_lin: !new:speechbrain.nnet.linear.Linear
71
+ input_size: !ref <d_model>
72
+ n_neurons: !ref <output_neurons>
73
+
74
+ model: !new:torch.nn.ModuleList
75
+ - [!ref <length_regulator>, !ref <transformer_decoder>, !ref <seq_lin>]
76
+
77
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
78
+ wav2vec2: !ref <wav2vec2>
79
+ length_regulator: !ref <length_regulator>
80
+
81
+ decoder_beamsearch: !new:speechbrain.decoders.seq2seq.S2STransformerBeamSearcher
82
+ modules: [!ref <transformer_decoder>, !ref <seq_lin>]
83
+ bos_index: !ref <bos_index>
84
+ eos_index: !ref <eos_index>
85
+ min_decode_ratio: !ref <min_decode_ratio>
86
+ max_decode_ratio: !ref <max_decode_ratio>
87
+ beam_size: 10
88
+ temperature: 1.0
89
+
90
+ modules:
91
+ encoder: !ref <encoder>
92
+ decoder: !ref <decoder_beamsearch>
93
+
94
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
95
+ loadables:
96
+ model: !ref <model>
97
+ wav2vec2: !ref <wav2vec2>
98
+ paths:
99
+ wav2vec2: !ref <pretrained_path>/wav2vec2.ckpt
100
+ model: !ref <pretrained_path>/model.ckpt