nguyenvulebinh committed on
Commit
6a93ac9
1 Parent(s): 2ec5c73

Upload model

Browse files
Files changed (3) hide show
  1. config.json +239 -413
  2. generation_config.json +13 -0
  3. pytorch_model.bin +2 -2
config.json CHANGED
@@ -1,472 +1,298 @@
1
  {
2
- "_name_or_path": "/export/data1/data/binhnguyen/workspaces/robustspeech/model-bin/asr/wavlm-large_bart-base-deepfilter0.3/avg_augment",
3
  "architectures": [
4
- "SEASRModel"
5
  ],
6
- "asr_config": {
7
- "_commit_hash": null,
8
- "_name_or_path": "",
9
- "add_cross_attention": false,
10
- "architectures": null,
 
 
 
 
 
 
11
  "bad_words_ids": null,
12
  "begin_suppress_tokens": null,
13
- "bos_token_id": null,
14
  "chunk_size_feed_forward": 0,
 
 
15
  "cross_attention_hidden_size": null,
16
- "decoder": {
17
- "_name_or_path": "facebook/bart-base",
18
- "activation_dropout": 0.1,
19
- "activation_function": "gelu",
20
- "add_bias_logits": false,
21
- "add_cross_attention": true,
22
- "add_final_layer_norm": false,
23
- "architectures": [
24
- "BartModel"
25
- ],
26
- "attention_dropout": 0.1,
27
- "bad_words_ids": null,
28
- "begin_suppress_tokens": null,
29
- "bos_token_id": 0,
30
- "chunk_size_feed_forward": 0,
31
- "classif_dropout": 0.1,
32
- "classifier_dropout": 0.0,
33
- "cross_attention_hidden_size": null,
34
- "d_model": 768,
35
- "decoder_attention_heads": 12,
36
- "decoder_ffn_dim": 3072,
37
- "decoder_layerdrop": 0.0,
38
- "decoder_layers": 6,
39
- "decoder_start_token_id": 2,
40
- "diversity_penalty": 0.0,
41
- "do_sample": false,
42
- "dropout": 0.1,
43
- "early_stopping": true,
44
- "encoder_attention_heads": 12,
45
- "encoder_ffn_dim": 3072,
46
- "encoder_layerdrop": 0.0,
47
- "encoder_layers": 6,
48
- "encoder_no_repeat_ngram_size": 0,
49
- "eos_token_id": 2,
50
- "exponential_decay_length_penalty": null,
51
- "finetuning_task": null,
52
- "forced_bos_token_id": 0,
53
- "forced_eos_token_id": 2,
54
- "gradient_checkpointing": false,
55
- "id2label": {
56
- "0": "LABEL_0",
57
- "1": "LABEL_1",
58
- "2": "LABEL_2"
59
- },
60
- "init_std": 0.02,
61
- "is_decoder": true,
62
- "is_encoder_decoder": true,
63
- "label2id": {
64
- "LABEL_0": 0,
65
- "LABEL_1": 1,
66
- "LABEL_2": 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  },
68
- "length_penalty": 1.0,
69
- "max_length": 20,
70
- "max_position_embeddings": 1024,
71
- "min_length": 0,
72
- "model_type": "bart",
73
- "no_repeat_ngram_size": 3,
74
- "normalize_before": false,
75
- "normalize_embedding": true,
76
- "num_beam_groups": 1,
77
- "num_beams": 4,
78
- "num_hidden_layers": 6,
79
- "num_return_sequences": 1,
80
- "output_attentions": false,
81
- "output_hidden_states": false,
82
- "output_scores": false,
83
- "pad_token_id": 1,
84
- "prefix": null,
85
- "problem_type": null,
86
- "pruned_heads": {},
87
- "remove_invalid_values": false,
88
- "repetition_penalty": 1.0,
89
- "return_dict": true,
90
- "return_dict_in_generate": false,
91
- "scale_embedding": false,
92
- "sep_token_id": null,
93
- "suppress_tokens": null,
94
- "task_specific_params": {
95
- "summarization": {
96
- "length_penalty": 1.0,
97
- "max_length": 128,
98
- "min_length": 12,
99
- "num_beams": 4
100
- },
101
- "summarization_cnn": {
102
- "length_penalty": 2.0,
103
- "max_length": 142,
104
- "min_length": 56,
105
- "num_beams": 4
106
- },
107
- "summarization_xsum": {
108
- "length_penalty": 1.0,
109
- "max_length": 62,
110
- "min_length": 11,
111
- "num_beams": 6
112
- }
113
  },
114
- "temperature": 1.0,
115
- "tf_legacy_loss": false,
116
- "tie_encoder_decoder": false,
117
- "tie_word_embeddings": true,
118
- "tokenizer_class": null,
119
- "top_k": 50,
120
- "top_p": 1.0,
121
- "torch_dtype": "float32",
122
- "torchscript": false,
123
- "transformers_version": "4.28.1",
124
- "typical_p": 1.0,
125
- "use_bfloat16": false,
126
- "use_cache": true,
127
- "vocab_size": 50265
128
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  "decoder_start_token_id": null,
 
130
  "diversity_penalty": 0.0,
131
  "do_sample": false,
 
132
  "early_stopping": false,
133
- "encoder": {
134
- "_name_or_path": "microsoft/wavlm-large",
135
- "activation_dropout": 0.0,
136
- "adapter_kernel_size": 3,
137
- "adapter_stride": 2,
138
- "add_adapter": true,
139
- "add_cross_attention": false,
140
- "apply_spec_augment": true,
141
- "architectures": [
142
- "WavLMModel"
143
- ],
144
- "attention_dropout": 0.1,
145
- "bad_words_ids": null,
146
- "begin_suppress_tokens": null,
147
- "bos_token_id": 1,
148
- "chunk_size_feed_forward": 0,
149
- "classifier_proj_size": 256,
150
- "codevector_dim": 768,
151
- "contrastive_logits_temperature": 0.1,
152
- "conv_bias": false,
153
- "conv_dim": [
154
- 512,
155
- 512,
156
- 512,
157
- 512,
158
- 512,
159
- 512,
160
- 512
161
- ],
162
- "conv_kernel": [
163
- 10,
164
- 3,
165
- 3,
166
- 3,
167
- 3,
168
- 2,
169
- 2
170
- ],
171
- "conv_stride": [
172
- 5,
173
- 2,
174
- 2,
175
- 2,
176
- 2,
177
- 2,
178
- 2
179
- ],
180
- "cross_attention_hidden_size": null,
181
- "ctc_loss_reduction": "sum",
182
- "ctc_zero_infinity": false,
183
- "decoder_start_token_id": null,
184
- "diversity_loss_weight": 0.1,
185
- "diversity_penalty": 0.0,
186
- "do_sample": false,
187
- "do_stable_layer_norm": true,
188
- "early_stopping": false,
189
- "encoder_no_repeat_ngram_size": 0,
190
- "eos_token_id": 2,
191
- "exponential_decay_length_penalty": null,
192
- "feat_extract_activation": "gelu",
193
- "feat_extract_dropout": 0.0,
194
- "feat_extract_norm": "layer",
195
- "feat_proj_dropout": 0.1,
196
- "feat_quantizer_dropout": 0.0,
197
- "final_dropout": 0.0,
198
- "finetuning_task": null,
199
- "forced_bos_token_id": null,
200
- "forced_eos_token_id": null,
201
- "gradient_checkpointing": false,
202
- "hidden_act": "gelu",
203
- "hidden_dropout": 0.1,
204
- "hidden_size": 1024,
205
- "id2label": {
206
- "0": "LABEL_0",
207
- "1": "LABEL_1"
208
- },
209
- "initializer_range": 0.02,
210
- "intermediate_size": 4096,
211
- "is_decoder": false,
212
- "is_encoder_decoder": false,
213
- "label2id": {
214
- "LABEL_0": 0,
215
- "LABEL_1": 1
216
- },
217
- "layer_norm_eps": 1e-05,
218
- "layerdrop": 0.1,
219
- "length_penalty": 1.0,
220
- "mask_channel_length": 10,
221
- "mask_channel_min_space": 1,
222
- "mask_channel_other": 0.0,
223
- "mask_channel_prob": 0.0,
224
- "mask_channel_selection": "static",
225
- "mask_feature_length": 10,
226
- "mask_feature_min_masks": 0,
227
- "mask_feature_prob": 0.0,
228
- "mask_time_length": 10,
229
- "mask_time_min_masks": 2,
230
- "mask_time_min_space": 1,
231
- "mask_time_other": 0.0,
232
- "mask_time_prob": 0.075,
233
- "mask_time_selection": "static",
234
- "max_bucket_distance": 800,
235
- "max_length": 20,
236
- "min_length": 0,
237
- "model_type": "wavlm",
238
- "no_repeat_ngram_size": 0,
239
- "num_adapter_layers": 3,
240
- "num_attention_heads": 16,
241
- "num_beam_groups": 1,
242
- "num_beams": 1,
243
- "num_buckets": 320,
244
- "num_codevector_groups": 2,
245
- "num_codevectors_per_group": 320,
246
- "num_conv_pos_embedding_groups": 16,
247
- "num_conv_pos_embeddings": 128,
248
- "num_ctc_classes": 80,
249
- "num_feat_extract_layers": 7,
250
- "num_hidden_layers": 24,
251
- "num_negatives": 100,
252
- "num_return_sequences": 1,
253
- "output_attentions": false,
254
- "output_hidden_size": 1024,
255
- "output_hidden_states": false,
256
- "output_scores": false,
257
- "pad_token_id": 0,
258
- "prefix": null,
259
- "problem_type": null,
260
- "proj_codevector_dim": 768,
261
- "pruned_heads": {},
262
- "remove_invalid_values": false,
263
- "repetition_penalty": 1.0,
264
- "replace_prob": 0.5,
265
- "return_dict": true,
266
- "return_dict_in_generate": false,
267
- "sep_token_id": null,
268
- "suppress_tokens": null,
269
- "task_specific_params": null,
270
- "tdnn_dilation": [
271
- 1,
272
- 2,
273
- 3,
274
- 1,
275
- 1
276
- ],
277
- "tdnn_dim": [
278
- 512,
279
- 512,
280
- 512,
281
- 512,
282
- 1500
283
- ],
284
- "tdnn_kernel": [
285
- 5,
286
- 3,
287
- 3,
288
- 1,
289
- 1
290
- ],
291
- "temperature": 1.0,
292
- "tf_legacy_loss": false,
293
- "tie_encoder_decoder": false,
294
- "tie_word_embeddings": true,
295
- "tokenizer_class": "Wav2Vec2CTCTokenizer",
296
- "top_k": 50,
297
- "top_p": 1.0,
298
- "torch_dtype": "float32",
299
- "torchscript": false,
300
- "transformers_version": "4.28.1",
301
- "typical_p": 1.0,
302
- "use_bfloat16": false,
303
- "use_weighted_layer_sum": false,
304
- "vocab_size": 32,
305
- "xvector_output_dim": 512
306
- },
307
  "encoder_no_repeat_ngram_size": 0,
308
- "eos_token_id": null,
309
  "exponential_decay_length_penalty": null,
 
 
 
 
 
 
310
  "finetuning_task": null,
311
  "forced_bos_token_id": null,
312
  "forced_eos_token_id": null,
 
 
 
 
313
  "id2label": {
314
  "0": "LABEL_0",
315
  "1": "LABEL_1"
316
  },
 
 
317
  "is_decoder": false,
318
- "is_encoder_decoder": true,
319
  "label2id": {
320
  "LABEL_0": 0,
321
  "LABEL_1": 1
322
  },
 
 
323
  "length_penalty": 1.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
  "max_length": 20,
325
  "min_length": 0,
326
- "model_type": "speech-encoder-decoder",
327
  "no_repeat_ngram_size": 0,
 
 
328
  "num_beam_groups": 1,
329
  "num_beams": 1,
 
 
 
 
 
 
 
 
 
330
  "num_return_sequences": 1,
331
  "output_attentions": false,
 
332
  "output_hidden_states": false,
333
  "output_scores": false,
334
- "pad_token_id": null,
335
  "prefix": null,
336
  "problem_type": null,
 
337
  "pruned_heads": {},
338
  "remove_invalid_values": false,
339
  "repetition_penalty": 1.0,
 
340
  "return_dict": true,
341
  "return_dict_in_generate": false,
342
  "sep_token_id": null,
343
  "suppress_tokens": null,
344
  "task_specific_params": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
  "temperature": 1.0,
346
  "tf_legacy_loss": false,
347
  "tie_encoder_decoder": false,
348
  "tie_word_embeddings": true,
349
- "tokenizer_class": null,
350
  "top_k": 50,
351
  "top_p": 1.0,
352
- "torch_dtype": null,
353
  "torchscript": false,
354
- "transformers_version": null,
355
  "typical_p": 1.0,
356
- "use_bfloat16": false
357
- },
358
- "model_type": "robustspeech",
359
- "speech_enhance_config": {
360
- "deepfilternet": {
361
- "conv_ch": "64",
362
- "conv_dec_mode": "transposed",
363
- "conv_depthwise": "True",
364
- "conv_kernel": "1,3",
365
- "conv_kernel_inp": "3,3",
366
- "conv_lookahead": "2",
367
- "convt_depthwise": "True",
368
- "df_gru_skip": "none",
369
- "df_hidden_dim": "256",
370
- "df_lookahead": "2",
371
- "df_n_iter": "1",
372
- "df_num_layers": "2",
373
- "df_order": "5",
374
- "df_output_layer": "groupedlinear",
375
- "df_pathway_kernel_size_t": "5",
376
- "dfop_method": "df",
377
- "emb_hidden_dim": "256",
378
- "emb_num_layers": "3",
379
- "enc_concat": "True",
380
- "group_shuffle": "False",
381
- "gru_groups": "8",
382
- "gru_type": "squeeze",
383
- "linear_groups": "8",
384
- "mask_pf": "False"
385
- },
386
- "df": {
387
- "fft_size": "960",
388
- "hop_size": "480",
389
- "lsnr_max": "35",
390
- "lsnr_min": "-15",
391
- "min_nb_erb_freqs": "2",
392
- "nb_df": "96",
393
- "nb_erb": "32",
394
- "norm_tau": "1",
395
- "pad_mode": "input_specf",
396
- "sr": "48000"
397
- },
398
- "dfalphaloss": {
399
- "factor": "0.0"
400
- },
401
- "localsnrloss": {
402
- "factor": "1e-3"
403
- },
404
- "maskloss": {
405
- "f_under": "1",
406
- "factor": "0",
407
- "gamma": "0.6",
408
- "gamma_pred": "0.6",
409
- "mask": "iam"
410
- },
411
- "multiresspecloss": {
412
- "factor": "500",
413
- "factor_complex": "500",
414
- "fft_sizes": "256,512,1024",
415
- "gamma": "0.3"
416
- },
417
- "optim": {
418
- "lr": "0.001",
419
- "lr_cycle_decay": "0.5",
420
- "lr_cycle_epochs": "-1",
421
- "lr_cycle_limit": "1",
422
- "lr_cycle_mul": "1.0",
423
- "lr_min": "1e-06",
424
- "lr_update_per_epoch": "False",
425
- "lr_warmup": "0.0001",
426
- "momentum": "0",
427
- "optimizer": "adamw",
428
- "warmup_epochs": "3",
429
- "weight_decay": "1e-12",
430
- "weight_decay_end": "0.05"
431
- },
432
- "sdrloss": {
433
- "factor": "0.0",
434
- "segmental_ws": "0"
435
- },
436
- "spectralloss": {
437
- "factor_complex": "1000",
438
- "factor_magnitude": "1000",
439
- "gamma": "0.3"
440
- },
441
- "train": {
442
- "batch_size": "96",
443
- "batch_size_eval": "128",
444
- "batch_size_scheduling": "0/8,1/16,2/24,5/32,10/64,20/128,40/9999",
445
- "dataloader_snrs": "-5,0,5,10,20,40",
446
- "detect_anomaly": "false",
447
- "device": "",
448
- "df_only": "false",
449
- "early_stopping_patience": "15",
450
- "global_ds_sampling_f": "1",
451
- "jit": "false",
452
- "log_freq": "100",
453
- "log_timings": "False",
454
- "mask_only": "false",
455
- "max_epochs": "100",
456
- "max_sample_len_s": "3.0",
457
- "model": "deepfilternet2",
458
- "num_prefetch_batches": "8",
459
- "num_workers": "16",
460
- "overfit": "false",
461
- "p_atten_lim": "0.0",
462
- "p_reverb": "0.1",
463
- "seed": "43",
464
- "start_eval": "true",
465
- "validation_criteria": "loss",
466
- "validation_criteria_rule": "min",
467
- "validation_set_caching": "false"
468
- }
469
  },
 
 
 
470
  "torch_dtype": "float32",
471
- "transformers_version": "4.28.1"
472
  }
 
1
  {
2
+ "_commit_hash": null,
3
  "architectures": [
4
+ "SpeechEncoderDecoderModel"
5
  ],
6
+ "decoder": {
7
+ "_name_or_path": "facebook/bart-base",
8
+ "activation_dropout": 0.1,
9
+ "activation_function": "gelu",
10
+ "add_bias_logits": false,
11
+ "add_cross_attention": true,
12
+ "add_final_layer_norm": false,
13
+ "architectures": [
14
+ "BartModel"
15
+ ],
16
+ "attention_dropout": 0.1,
17
  "bad_words_ids": null,
18
  "begin_suppress_tokens": null,
19
+ "bos_token_id": 0,
20
  "chunk_size_feed_forward": 0,
21
+ "classif_dropout": 0.1,
22
+ "classifier_dropout": 0.0,
23
  "cross_attention_hidden_size": null,
24
+ "d_model": 768,
25
+ "decoder_attention_heads": 12,
26
+ "decoder_ffn_dim": 3072,
27
+ "decoder_layerdrop": 0.0,
28
+ "decoder_layers": 6,
29
+ "decoder_start_token_id": 2,
30
+ "diversity_penalty": 0.0,
31
+ "do_sample": false,
32
+ "dropout": 0.1,
33
+ "early_stopping": true,
34
+ "encoder_attention_heads": 12,
35
+ "encoder_ffn_dim": 3072,
36
+ "encoder_layerdrop": 0.0,
37
+ "encoder_layers": 6,
38
+ "encoder_no_repeat_ngram_size": 0,
39
+ "eos_token_id": 2,
40
+ "exponential_decay_length_penalty": null,
41
+ "finetuning_task": null,
42
+ "forced_bos_token_id": 0,
43
+ "forced_eos_token_id": 2,
44
+ "gradient_checkpointing": false,
45
+ "id2label": {
46
+ "0": "LABEL_0",
47
+ "1": "LABEL_1",
48
+ "2": "LABEL_2"
49
+ },
50
+ "init_std": 0.02,
51
+ "is_decoder": true,
52
+ "is_encoder_decoder": true,
53
+ "label2id": {
54
+ "LABEL_0": 0,
55
+ "LABEL_1": 1,
56
+ "LABEL_2": 2
57
+ },
58
+ "length_penalty": 1.0,
59
+ "max_length": 20,
60
+ "max_position_embeddings": 1024,
61
+ "min_length": 0,
62
+ "model_type": "bart",
63
+ "no_repeat_ngram_size": 3,
64
+ "normalize_before": false,
65
+ "normalize_embedding": true,
66
+ "num_beam_groups": 1,
67
+ "num_beams": 4,
68
+ "num_hidden_layers": 6,
69
+ "num_return_sequences": 1,
70
+ "output_attentions": false,
71
+ "output_hidden_states": false,
72
+ "output_scores": false,
73
+ "pad_token_id": 1,
74
+ "prefix": null,
75
+ "problem_type": null,
76
+ "pruned_heads": {},
77
+ "remove_invalid_values": false,
78
+ "repetition_penalty": 1.0,
79
+ "return_dict": true,
80
+ "return_dict_in_generate": false,
81
+ "scale_embedding": false,
82
+ "sep_token_id": null,
83
+ "suppress_tokens": null,
84
+ "task_specific_params": {
85
+ "summarization": {
86
+ "length_penalty": 1.0,
87
+ "max_length": 128,
88
+ "min_length": 12,
89
+ "num_beams": 4
90
  },
91
+ "summarization_cnn": {
92
+ "length_penalty": 2.0,
93
+ "max_length": 142,
94
+ "min_length": 56,
95
+ "num_beams": 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  },
97
+ "summarization_xsum": {
98
+ "length_penalty": 1.0,
99
+ "max_length": 62,
100
+ "min_length": 11,
101
+ "num_beams": 6
102
+ }
 
 
 
 
 
 
 
 
103
  },
104
+ "temperature": 1.0,
105
+ "tf_legacy_loss": false,
106
+ "tie_encoder_decoder": false,
107
+ "tie_word_embeddings": true,
108
+ "tokenizer_class": null,
109
+ "top_k": 50,
110
+ "top_p": 1.0,
111
+ "torch_dtype": "float32",
112
+ "torchscript": false,
113
+ "transformers_version": "4.30.2",
114
+ "typical_p": 1.0,
115
+ "use_bfloat16": false,
116
+ "use_cache": true,
117
+ "vocab_size": 51266
118
+ },
119
+ "encoder": {
120
+ "_name_or_path": "microsoft/wavlm-large",
121
+ "activation_dropout": 0.0,
122
+ "adapter_kernel_size": 3,
123
+ "adapter_stride": 2,
124
+ "add_adapter": true,
125
+ "add_cross_attention": false,
126
+ "apply_spec_augment": true,
127
+ "architectures": [
128
+ "WavLMModel"
129
+ ],
130
+ "attention_dropout": 0.1,
131
+ "bad_words_ids": null,
132
+ "begin_suppress_tokens": null,
133
+ "bos_token_id": 1,
134
+ "chunk_size_feed_forward": 0,
135
+ "classifier_proj_size": 256,
136
+ "codevector_dim": 768,
137
+ "contrastive_logits_temperature": 0.1,
138
+ "conv_bias": false,
139
+ "conv_dim": [
140
+ 512,
141
+ 512,
142
+ 512,
143
+ 512,
144
+ 512,
145
+ 512,
146
+ 512
147
+ ],
148
+ "conv_kernel": [
149
+ 10,
150
+ 3,
151
+ 3,
152
+ 3,
153
+ 3,
154
+ 2,
155
+ 2
156
+ ],
157
+ "conv_stride": [
158
+ 5,
159
+ 2,
160
+ 2,
161
+ 2,
162
+ 2,
163
+ 2,
164
+ 2
165
+ ],
166
+ "cross_attention_hidden_size": null,
167
+ "ctc_loss_reduction": "sum",
168
+ "ctc_zero_infinity": false,
169
  "decoder_start_token_id": null,
170
+ "diversity_loss_weight": 0.1,
171
  "diversity_penalty": 0.0,
172
  "do_sample": false,
173
+ "do_stable_layer_norm": true,
174
  "early_stopping": false,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  "encoder_no_repeat_ngram_size": 0,
176
+ "eos_token_id": 2,
177
  "exponential_decay_length_penalty": null,
178
+ "feat_extract_activation": "gelu",
179
+ "feat_extract_dropout": 0.0,
180
+ "feat_extract_norm": "layer",
181
+ "feat_proj_dropout": 0.1,
182
+ "feat_quantizer_dropout": 0.0,
183
+ "final_dropout": 0.0,
184
  "finetuning_task": null,
185
  "forced_bos_token_id": null,
186
  "forced_eos_token_id": null,
187
+ "gradient_checkpointing": false,
188
+ "hidden_act": "gelu",
189
+ "hidden_dropout": 0.1,
190
+ "hidden_size": 1024,
191
  "id2label": {
192
  "0": "LABEL_0",
193
  "1": "LABEL_1"
194
  },
195
+ "initializer_range": 0.02,
196
+ "intermediate_size": 4096,
197
  "is_decoder": false,
198
+ "is_encoder_decoder": false,
199
  "label2id": {
200
  "LABEL_0": 0,
201
  "LABEL_1": 1
202
  },
203
+ "layer_norm_eps": 1e-05,
204
+ "layerdrop": 0.1,
205
  "length_penalty": 1.0,
206
+ "mask_channel_length": 10,
207
+ "mask_channel_min_space": 1,
208
+ "mask_channel_other": 0.0,
209
+ "mask_channel_prob": 0.0,
210
+ "mask_channel_selection": "static",
211
+ "mask_feature_length": 10,
212
+ "mask_feature_min_masks": 0,
213
+ "mask_feature_prob": 0.0,
214
+ "mask_time_length": 10,
215
+ "mask_time_min_masks": 2,
216
+ "mask_time_min_space": 1,
217
+ "mask_time_other": 0.0,
218
+ "mask_time_prob": 0.075,
219
+ "mask_time_selection": "static",
220
+ "max_bucket_distance": 800,
221
  "max_length": 20,
222
  "min_length": 0,
223
+ "model_type": "wavlm",
224
  "no_repeat_ngram_size": 0,
225
+ "num_adapter_layers": 3,
226
+ "num_attention_heads": 16,
227
  "num_beam_groups": 1,
228
  "num_beams": 1,
229
+ "num_buckets": 320,
230
+ "num_codevector_groups": 2,
231
+ "num_codevectors_per_group": 320,
232
+ "num_conv_pos_embedding_groups": 16,
233
+ "num_conv_pos_embeddings": 128,
234
+ "num_ctc_classes": 80,
235
+ "num_feat_extract_layers": 7,
236
+ "num_hidden_layers": 24,
237
+ "num_negatives": 100,
238
  "num_return_sequences": 1,
239
  "output_attentions": false,
240
+ "output_hidden_size": 1024,
241
  "output_hidden_states": false,
242
  "output_scores": false,
243
+ "pad_token_id": 0,
244
  "prefix": null,
245
  "problem_type": null,
246
+ "proj_codevector_dim": 768,
247
  "pruned_heads": {},
248
  "remove_invalid_values": false,
249
  "repetition_penalty": 1.0,
250
+ "replace_prob": 0.5,
251
  "return_dict": true,
252
  "return_dict_in_generate": false,
253
  "sep_token_id": null,
254
  "suppress_tokens": null,
255
  "task_specific_params": null,
256
+ "tdnn_dilation": [
257
+ 1,
258
+ 2,
259
+ 3,
260
+ 1,
261
+ 1
262
+ ],
263
+ "tdnn_dim": [
264
+ 512,
265
+ 512,
266
+ 512,
267
+ 512,
268
+ 1500
269
+ ],
270
+ "tdnn_kernel": [
271
+ 5,
272
+ 3,
273
+ 3,
274
+ 1,
275
+ 1
276
+ ],
277
  "temperature": 1.0,
278
  "tf_legacy_loss": false,
279
  "tie_encoder_decoder": false,
280
  "tie_word_embeddings": true,
281
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
282
  "top_k": 50,
283
  "top_p": 1.0,
284
+ "torch_dtype": "float32",
285
  "torchscript": false,
286
+ "transformers_version": "4.30.2",
287
  "typical_p": 1.0,
288
+ "use_bfloat16": false,
289
+ "use_weighted_layer_sum": false,
290
+ "vocab_size": 32,
291
+ "xvector_output_dim": 512
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  },
293
+ "is_encoder_decoder": true,
294
+ "model_type": "speech-encoder-decoder",
295
+ "tie_word_embeddings": false,
296
  "torch_dtype": "float32",
297
+ "transformers_version": null
298
  }
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "early_stopping": true,
6
+ "eos_token_id": 2,
7
+ "forced_bos_token_id": 0,
8
+ "forced_eos_token_id": 2,
9
+ "no_repeat_ngram_size": 3,
10
+ "num_beams": 4,
11
+ "pad_token_id": 1,
12
+ "transformers_version": "4.30.2"
13
+ }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:75dfb577ab113ab181d6fcc23d6ee841dea4583374974d670afb2e78d5f0afd3
3
- size 1888930213
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffa3f1e422fd83a12159a9b0cf5696acbfc28dfb4e41007dfe2fc2cde75ad393
3
+ size 1885680785