Upload folder using huggingface_hub
- best/preprocessor_config.json +10 -0
- best/pytorch_model.bin +3 -0
- best/training_args.bin +3 -0
- config.pth +3 -0
- preprocessor_config.json +10 -0
- special_tokens_map.json +6 -0
- tokenizer_config.json +15 -0
- train.log +81 -0
- train_conf.json +32 -0
- vocab.json +44 -0
best/preprocessor_config.json
ADDED
@@ -0,0 +1,10 @@
+{
+  "do_normalize": true,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "processor_class": "Wav2Vec2Processor",
+  "return_attention_mask": true,
+  "sampling_rate": 16000
+}
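For reference, a minimal sketch of loading this feature-extractor config with transformers and running it on a dummy waveform; the `best` path refers to the folder added in this commit, and the 1-second silent input is purely illustrative:

```python
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

# Load the config added above (do_normalize=True, 16 kHz, right padding).
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("best")

# Illustrative 1-second silent waveform at the configured sampling rate.
dummy_audio = np.zeros(16000, dtype=np.float32)
inputs = feature_extractor(dummy_audio, sampling_rate=16000, return_tensors="pt")
print({k: v.shape for k, v in inputs.items()})
```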
best/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db747c73a5a0bcfa3a14bcee18d9fe4f6906d017f56d57c77cd9a17aded446d1
+size 1266295149
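The checkpoint itself is stored via Git LFS (pointer above), and `best/` does not ship its own config.json in this commit. A hedged sketch of restoring it, assuming the base architecture named in train.log (`facebook/wav2vec2-large-lv60`) with a CTC head sized to the 42-token vocab.json in this repo; adjust the config if the actual checkpoint differs:

```python
import torch
from transformers import Wav2Vec2Config, Wav2Vec2ForCTC

# Assumption: the weights match the base model from train.log with a
# 42-entry CTC output layer ([PAD] = id 0 per vocab.json).
config = Wav2Vec2Config.from_pretrained(
    "facebook/wav2vec2-large-lv60",
    vocab_size=42,
    ctc_zero_infinity=True,
    pad_token_id=0,
)
model = Wav2Vec2ForCTC(config)

state_dict = torch.load("best/pytorch_model.bin", map_location="cpu")
missing, unexpected = model.load_state_dict(state_dict, strict=False)
print("missing:", missing, "unexpected:", unexpected)
```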
best/training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a8c57214620f9ca91e802bd74ba3796439d92b8900367e6c767a8d141c124fd
+size 3695
config.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3bc4625ed5a3bd5eb46b3a62a3a586bd3d984606cf3fdfa1192e0aee8b316e15
+size 4591
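Both best/training_args.bin and config.pth are torch-pickled objects behind LFS pointers, so their contents are not visible in this diff. A small, hedged inspection snippet (weights_only=False is needed on recent PyTorch to unpickle arbitrary objects):

```python
import torch

# Print whatever is stored in the pickled training arguments and config blob.
training_args = torch.load("best/training_args.bin", map_location="cpu", weights_only=False)
config_blob = torch.load("config.pth", map_location="cpu", weights_only=False)
print(type(training_args), training_args)
print(type(config_blob), config_blob)
```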
preprocessor_config.json
ADDED
@@ -0,0 +1,10 @@
+{
+  "do_normalize": true,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "processor_class": "Wav2Vec2Processor",
+  "return_attention_mask": true,
+  "sampling_rate": 16000
+}
special_tokens_map.json
ADDED
@@ -0,0 +1,6 @@
+{
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "pad_token": "[PAD]",
+  "unk_token": "err"
+}
tokenizer_config.json
ADDED
@@ -0,0 +1,15 @@
+{
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "do_phonemize": false,
+  "eos_token": "</s>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "phone_delimiter_token": " ",
+  "phonemizer_backend": "espeak",
+  "phonemizer_lang": "en-us",
+  "processor_class": "Wav2Vec2Processor",
+  "tokenizer_class": "Wav2Vec2PhonemeCTCTokenizer",
+  "unk_token": "err",
+  "word_delimiter_token": null
+}
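Together with special_tokens_map.json and vocab.json, this describes a Wav2Vec2PhonemeCTCTokenizer that expects already-phonemized, space-separated input (do_phonemize is false and word_delimiter_token is null). A minimal sketch of loading it from the repo root and pairing it with the feature extractor; the "." path and the sample phoneme string are illustrative:

```python
from transformers import (
    Wav2Vec2FeatureExtractor,
    Wav2Vec2PhonemeCTCTokenizer,
    Wav2Vec2Processor,
)

# Load tokenizer + feature extractor from the files added in this commit.
tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained(".")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(".")
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# do_phonemize is false, so input must already be space-separated phonemes.
ids = tokenizer("sil hh ah l ow sil").input_ids
print(ids)
print(tokenizer.decode(ids))
```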
train.log
ADDED
@@ -0,0 +1,81 @@
+[INFO] Set manual seed 66
+[NOTE] Model args ...
+{
+"model_path": "facebook/wav2vec2-large-lv60",
+"problem_type": "single_label_classification",
+"task_type": "asr",
+"model_type": "baseline",
+"final_dropout": 0.0,
+"ctc_zero_infinity": true,
+"layerdrop": 0.1,
+"activation_dropout": 0.1,
+"mask_time_prob": 0.065,
+"mask_time_length": 10,
+"mask_feature_prob": 0.015,
+"mask_feature_length": 64
+}
+[INFO] Save tokenizer/extractor/processor to exp/timit/train_timit_baseline_wav2vec2_large_lv60_66_noworddel ...
+[INFO] data-json/timit/train_dataset/dataset.arrow exists, using it
+[INFO] data-json/timit/dev_dataset/dataset.arrow exists, using it
+[INFO] Train a baseline model from facebook/wav2vec2-large-lv60 ...
+{'loss': 452.0092, 'learning_rate': 7.166666666666667e-06, 'epoch': 0.86}
+{'loss': 396.7542, 'learning_rate': 1.5333333333333334e-05, 'epoch': 1.72}
+{'eval_loss': 356.89007568359375, 'eval_wer': 1.0, 'eval_cer': 1.0, 'eval_runtime': 11.6259, 'eval_samples_per_second': 34.406, 'eval_steps_per_second': 1.118, 'epoch': 1.72}
+{'loss': 315.949, 'learning_rate': 2.3666666666666668e-05, 'epoch': 2.59}
+{'loss': 168.6427, 'learning_rate': 3.2000000000000005e-05, 'epoch': 3.45}
+{'eval_loss': 133.75314331054688, 'eval_wer': 1.0, 'eval_cer': 1.0, 'eval_runtime': 11.1446, 'eval_samples_per_second': 35.892, 'eval_steps_per_second': 1.166, 'epoch': 3.45}
+{'loss': 129.0611, 'learning_rate': 4.0333333333333336e-05, 'epoch': 4.31}
+{'loss': 126.4254, 'learning_rate': 4.866666666666667e-05, 'epoch': 5.17}
+{'eval_loss': 123.59996032714844, 'eval_wer': 1.0, 'eval_cer': 1.0, 'eval_runtime': 11.1434, 'eval_samples_per_second': 35.896, 'eval_steps_per_second': 1.167, 'epoch': 5.17}
+{'loss': 123.9036, 'learning_rate': 5.6999999999999996e-05, 'epoch': 6.03}
+{'loss': 124.3407, 'learning_rate': 6.533333333333334e-05, 'epoch': 6.9}
+{'eval_loss': 123.45381164550781, 'eval_wer': 1.0, 'eval_cer': 1.0, 'eval_runtime': 11.2845, 'eval_samples_per_second': 35.447, 'eval_steps_per_second': 1.152, 'epoch': 6.9}
+{'loss': 122.7627, 'learning_rate': 7.366666666666668e-05, 'epoch': 7.76}
+{'loss': 124.9659, 'learning_rate': 8.2e-05, 'epoch': 8.62}
+{'eval_loss': 120.54447937011719, 'eval_wer': 1.0, 'eval_cer': 1.0, 'eval_runtime': 11.645, 'eval_samples_per_second': 34.35, 'eval_steps_per_second': 1.116, 'epoch': 8.62}
+{'loss': 117.4764, 'learning_rate': 9.033333333333334e-05, 'epoch': 9.48}
+{'loss': 122.2053, 'learning_rate': 9.866666666666668e-05, 'epoch': 10.34}
+{'eval_loss': 111.49588012695312, 'eval_wer': 0.8151690243740453, 'eval_cer': 0.7344880456135308, 'eval_runtime': 11.1121, 'eval_samples_per_second': 35.997, 'eval_steps_per_second': 1.17, 'epoch': 10.34}
+{'loss': 107.2907, 'learning_rate': 9.7e-05, 'epoch': 11.21}
+{'loss': 91.5518, 'learning_rate': 9.342857142857143e-05, 'epoch': 12.07}
+{'eval_loss': 71.25965118408203, 'eval_wer': 0.6252241482367006, 'eval_cer': 0.5373724306453931, 'eval_runtime': 11.7788, 'eval_samples_per_second': 33.959, 'eval_steps_per_second': 1.104, 'epoch': 12.07}
+{'loss': 67.4171, 'learning_rate': 8.985714285714287e-05, 'epoch': 12.93}
+{'loss': 50.1061, 'learning_rate': 8.62857142857143e-05, 'epoch': 13.79}
+{'eval_loss': 27.590818405151367, 'eval_wer': 0.24048615262004383, 'eval_cer': 0.22212639547697763, 'eval_runtime': 11.6657, 'eval_samples_per_second': 34.289, 'eval_steps_per_second': 1.114, 'epoch': 13.79}
+{'loss': 35.4837, 'learning_rate': 8.271428571428572e-05, 'epoch': 14.66}
+{'loss': 26.2278, 'learning_rate': 7.914285714285715e-05, 'epoch': 15.52}
+{'eval_loss': 14.451704025268555, 'eval_wer': 0.10035199574948529, 'eval_cer': 0.07608643572421063, 'eval_runtime': 11.7263, 'eval_samples_per_second': 34.111, 'eval_steps_per_second': 1.109, 'epoch': 15.52}
+{'loss': 21.4687, 'learning_rate': 7.557142857142857e-05, 'epoch': 16.38}
+{'loss': 19.5909, 'learning_rate': 7.2e-05, 'epoch': 17.24}
+{'eval_loss': 11.897302627563477, 'eval_wer': 0.08839742312545659, 'eval_cer': 0.06741411527957453, 'eval_runtime': 11.3848, 'eval_samples_per_second': 35.135, 'eval_steps_per_second': 1.142, 'epoch': 17.24}
+{'loss': 17.8596, 'learning_rate': 6.842857142857143e-05, 'epoch': 18.1}
+{'loss': 16.5052, 'learning_rate': 6.485714285714286e-05, 'epoch': 18.97}
+{'eval_loss': 9.827818870544434, 'eval_wer': 0.0839476655376237, 'eval_cer': 0.06537779694312683, 'eval_runtime': 11.4438, 'eval_samples_per_second': 34.953, 'eval_steps_per_second': 1.136, 'epoch': 18.97}
+{'loss': 15.9378, 'learning_rate': 6.12857142857143e-05, 'epoch': 19.83}
+{'loss': 13.871, 'learning_rate': 5.771428571428572e-05, 'epoch': 20.69}
+{'eval_loss': 9.462098121643066, 'eval_wer': 0.0830178654446437, 'eval_cer': 0.0646830530401035, 'eval_runtime': 11.1352, 'eval_samples_per_second': 35.922, 'eval_steps_per_second': 1.167, 'epoch': 20.69}
+{'loss': 15.1374, 'learning_rate': 5.414285714285715e-05, 'epoch': 21.55}
+{'loss': 13.3166, 'learning_rate': 5.057142857142857e-05, 'epoch': 22.41}
+{'eval_loss': 9.01073932647705, 'eval_wer': 0.08049412233512653, 'eval_cer': 0.06199990417325475, 'eval_runtime': 11.3574, 'eval_samples_per_second': 35.219, 'eval_steps_per_second': 1.145, 'epoch': 22.41}
+{'loss': 13.1342, 'learning_rate': 4.7e-05, 'epoch': 23.28}
+{'loss': 12.651, 'learning_rate': 4.342857142857143e-05, 'epoch': 24.14}
+{'eval_loss': 8.882560729980469, 'eval_wer': 0.07856810785681079, 'eval_cer': 0.06020315269991855, 'eval_runtime': 11.1062, 'eval_samples_per_second': 36.016, 'eval_steps_per_second': 1.171, 'epoch': 24.14}
+{'loss': 12.0675, 'learning_rate': 3.985714285714286e-05, 'epoch': 25.0}
+{'loss': 12.8447, 'learning_rate': 3.628571428571429e-05, 'epoch': 25.86}
+{'eval_loss': 8.821671485900879, 'eval_wer': 0.0781696221026765, 'eval_cer': 0.059101145129605674, 'eval_runtime': 11.5252, 'eval_samples_per_second': 34.707, 'eval_steps_per_second': 1.128, 'epoch': 25.86}
+{'loss': 12.0886, 'learning_rate': 3.271428571428571e-05, 'epoch': 26.72}
+{'loss': 13.3448, 'learning_rate': 2.9142857142857146e-05, 'epoch': 27.59}
+{'eval_loss': 8.660683631896973, 'eval_wer': 0.0773062363020522, 'eval_cer': 0.058094964304537394, 'eval_runtime': 11.1216, 'eval_samples_per_second': 35.966, 'eval_steps_per_second': 1.169, 'epoch': 27.59}
+{'loss': 11.6601, 'learning_rate': 2.5571428571428572e-05, 'epoch': 28.45}
+{'loss': 11.2744, 'learning_rate': 2.2000000000000003e-05, 'epoch': 29.31}
+{'eval_loss': 8.715682029724121, 'eval_wer': 0.07670850767085077, 'eval_cer': 0.057975180872981646, 'eval_runtime': 11.2314, 'eval_samples_per_second': 35.615, 'eval_steps_per_second': 1.157, 'epoch': 29.31}
+{'loss': 10.8255, 'learning_rate': 1.842857142857143e-05, 'epoch': 30.17}
+{'loss': 12.5791, 'learning_rate': 1.4857142857142858e-05, 'epoch': 31.03}
+{'eval_loss': 8.562932014465332, 'eval_wer': 0.07617719333200505, 'eval_cer': 0.05694504336160222, 'eval_runtime': 11.5283, 'eval_samples_per_second': 34.697, 'eval_steps_per_second': 1.128, 'epoch': 31.03}
+{'loss': 11.7516, 'learning_rate': 1.1285714285714285e-05, 'epoch': 31.9}
+{'loss': 11.1128, 'learning_rate': 7.714285714285714e-06, 'epoch': 32.76}
+{'eval_loss': 8.644214630126953, 'eval_wer': 0.0763764362090722, 'eval_cer': 0.05732835034258062, 'eval_runtime': 11.2033, 'eval_samples_per_second': 35.704, 'eval_steps_per_second': 1.16, 'epoch': 32.76}
+{'loss': 11.2963, 'learning_rate': 4.142857142857143e-06, 'epoch': 33.62}
+{'loss': 11.052, 'learning_rate': 5.714285714285715e-07, 'epoch': 34.48}
+{'eval_loss': 8.613777160644531, 'eval_wer': 0.07571229328551504, 'eval_cer': 0.05687317330266878, 'eval_runtime': 11.142, 'eval_samples_per_second': 35.9, 'eval_steps_per_second': 1.167, 'epoch': 34.48}
+{'train_runtime': 5691.097, 'train_samples_per_second': 22.491, 'train_steps_per_second': 0.351, 'train_loss': 75.09857940673828, 'epoch': 34.48}
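train.log reports eval_wer and eval_cer (error rates over the decoded phoneme strings) alongside the CTC loss. The metric code used for this run is not part of the commit; below is a hedged sketch of a compute_metrics helper that would produce numbers of this kind, using the evaluate library:

```python
import numpy as np
import evaluate  # pip install evaluate jiwer

# Hypothetical metric helper: an approximation of the shape such a function
# typically takes, not the exact code used for this run.
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

def compute_metrics(pred, processor):
    pred_ids = np.argmax(pred.predictions, axis=-1)
    label_ids = pred.label_ids.copy()
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(label_ids, group_tokens=False)
    return {
        "wer": wer_metric.compute(predictions=pred_str, references=label_str),
        "cer": cer_metric.compute(predictions=pred_str, references=label_str),
    }
```

When handed to transformers.Trainer, the processor argument would typically be bound first (for example via functools.partial), since Trainer calls compute_metrics with a single EvalPrediction.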
train_conf.json
ADDED
@@ -0,0 +1,32 @@
+[
+  {
+    "per_device_train_batch_size": 64,
+    "gradient_accumulation_steps": 1,
+    "per_device_eval_batch_size": 32,
+    "evaluation_strategy": "steps",
+    "max_steps": 2000,
+    "save_steps": 100,
+    "eval_steps": 100,
+    "logging_steps": 50,
+    "learning_rate": 0.0001,
+    "weight_decay": 0,
+    "warmup_steps": 600,
+    "save_total_limit": 1,
+    "metric_for_best_model": "wer",
+    "greater_is_better": false
+  },
+  {
+    "model_path": "facebook/wav2vec2-large-lv60",
+    "problem_type": "single_label_classification",
+    "task_type": "asr",
+    "model_type": "baseline",
+    "final_dropout": 0.0,
+    "ctc_zero_infinity": true,
+    "layerdrop": 0.1,
+    "activation_dropout": 0.1,
+    "mask_time_prob": 0.065,
+    "mask_time_length": 10,
+    "mask_feature_prob": 0.015,
+    "mask_feature_length": 64
+  }
+]
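The first object in train_conf.json lines up with standard transformers.TrainingArguments fields, and the second repeats the model arguments echoed in train.log. A hedged sketch of turning the first block into TrainingArguments; the output_dir is taken from the log:

```python
import json
from transformers import TrainingArguments

with open("train_conf.json") as f:
    trainer_conf, model_conf = json.load(f)  # model_conf holds the model args

# All keys in the first block map onto TrainingArguments fields.
# NOTE: on newer transformers releases "evaluation_strategy" may need to be
# renamed to "eval_strategy".
training_args = TrainingArguments(
    output_dir="exp/timit/train_timit_baseline_wav2vec2_large_lv60_66_noworddel",
    **trainer_conf,
)
print(training_args.max_steps, training_args.learning_rate, training_args.warmup_steps)
```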
vocab.json
ADDED
@@ -0,0 +1,44 @@
+{
+  "[PAD]": 0,
+  "aa": 1,
+  "ae": 2,
+  "ah": 3,
+  "ao": 4,
+  "aw": 5,
+  "ay": 6,
+  "b": 7,
+  "ch": 8,
+  "d": 9,
+  "dh": 10,
+  "eh": 11,
+  "er": 12,
+  "err": 41,
+  "ey": 13,
+  "f": 14,
+  "g": 15,
+  "hh": 16,
+  "ih": 17,
+  "iy": 18,
+  "jh": 19,
+  "k": 20,
+  "l": 21,
+  "m": 22,
+  "n": 23,
+  "ng": 24,
+  "ow": 25,
+  "oy": 26,
+  "p": 27,
+  "r": 28,
+  "s": 29,
+  "sh": 30,
+  "sil": 31,
+  "t": 32,
+  "th": 33,
+  "uh": 34,
+  "uw": 35,
+  "v": 36,
+  "w": 37,
+  "y": 38,
+  "z": 39,
+  "zh": 40
+}
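The vocabulary above is the folded 39-phone TIMIT set plus "sil", the "err" unknown token, and "[PAD]" (id 0, which Wav2Vec2's CTC loss typically uses as the blank), 42 entries in total. A quick sanity-check snippet:

```python
import json

with open("vocab.json") as f:
    vocab = json.load(f)

# Expect 42 ids covering 0..41 with [PAD] as id 0.
assert len(vocab) == 42
assert sorted(vocab.values()) == list(range(42))
print(sorted(vocab, key=vocab.get))  # tokens listed in id order
```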