sidleal committed on
Commit
eab7921
1 Parent(s): 1676084

Upload 11 files

Browse files
config.json ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Edresson/wav2vec2-large-xlsr-coraa-portuguese",
3
+ "activation_dropout": 0.0,
4
+ "adapter_attn_dim": null,
5
+ "adapter_kernel_size": 3,
6
+ "adapter_stride": 2,
7
+ "add_adapter": false,
8
+ "apply_spec_augment": true,
9
+ "architectures": [
10
+ "Wav2Vec2ForCTC"
11
+ ],
12
+ "attention_dropout": 0.1,
13
+ "bos_token_id": 1,
14
+ "classifier_proj_size": 256,
15
+ "codevector_dim": 768,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_bias": true,
18
+ "conv_dim": [
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512
26
+ ],
27
+ "conv_kernel": [
28
+ 10,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 2,
34
+ 2
35
+ ],
36
+ "conv_stride": [
37
+ 5,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2
44
+ ],
45
+ "ctc_loss_reduction": "mean",
46
+ "ctc_zero_infinity": true,
47
+ "diversity_loss_weight": 0.1,
48
+ "do_stable_layer_norm": true,
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_dropout": 0.0,
52
+ "feat_extract_norm": "layer",
53
+ "feat_proj_dropout": 0.1,
54
+ "feat_quantizer_dropout": 0.0,
55
+ "final_dropout": 0.0,
56
+ "hidden_act": "gelu",
57
+ "hidden_dropout": 0.1,
58
+ "hidden_size": 1024,
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 4096,
61
+ "layer_norm_eps": 1e-05,
62
+ "layerdrop": 0.0,
63
+ "mask_channel_length": 10,
64
+ "mask_channel_min_space": 1,
65
+ "mask_channel_other": 0.0,
66
+ "mask_channel_prob": 0.0,
67
+ "mask_channel_selection": "static",
68
+ "mask_feature_length": 10,
69
+ "mask_feature_min_masks": 0,
70
+ "mask_feature_prob": 0.0,
71
+ "mask_time_length": 10,
72
+ "mask_time_min_masks": 2,
73
+ "mask_time_min_space": 1,
74
+ "mask_time_other": 0.0,
75
+ "mask_time_prob": 0.05,
76
+ "mask_time_selection": "static",
77
+ "model_type": "wav2vec2",
78
+ "num_adapter_layers": 3,
79
+ "num_attention_heads": 16,
80
+ "num_codevector_groups": 2,
81
+ "num_codevectors_per_group": 320,
82
+ "num_conv_pos_embedding_groups": 16,
83
+ "num_conv_pos_embeddings": 128,
84
+ "num_feat_extract_layers": 7,
85
+ "num_hidden_layers": 24,
86
+ "num_negatives": 100,
87
+ "output_hidden_size": 1024,
88
+ "pad_token_id": 0,
89
+ "proj_codevector_dim": 768,
90
+ "tdnn_dilation": [
91
+ 1,
92
+ 2,
93
+ 3,
94
+ 1,
95
+ 1
96
+ ],
97
+ "tdnn_dim": [
98
+ 512,
99
+ 512,
100
+ 512,
101
+ 512,
102
+ 1500
103
+ ],
104
+ "tdnn_kernel": [
105
+ 5,
106
+ 3,
107
+ 3,
108
+ 1,
109
+ 1
110
+ ],
111
+ "torch_dtype": "float32",
112
+ "transformers_version": "4.39.3",
113
+ "use_weighted_layer_sum": false,
114
+ "vocab_size": 45,
115
+ "xvector_output_dim": 512
116
+ }
config_train.json ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_name": "Wav2Vec-fine-tuning-NURC-SP",
3
+ "run_description": "Fine tuning NURC-SP",
4
+ "seed": 42,
5
+ // AUDIO PARAMS
6
+ "sampling_rate": 16000,
7
+
8
+ // VOCABULARY PARAMETERS
9
+ "vocab":{
10
+ "vocab_path": "example/vocab_example.json", // generic vocab for Portuguese
11
+ "blank": "<pad>", // blank token for padding
12
+ "silence": "|", // token between words
13
+ "unk": "<unk>" // unk token
14
+ },
15
+
16
+ // TRAINING
17
+ "batch_size": 8, // Batch size for training.
18
+ "mixed_precision": true, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate.
19
+ "early_stop_epochs": 10, // If 0, early stopping is disabled; otherwise, the number of epochs without a decrease in validation loss after which training stops.
20
+ "preprocess_dataset": false, // if true, the dataset will be pre-processed and saved in disk, otherwise the audio files will be loaded in each step. Preprocessing makes training faster, but requires much more disk space.
21
+
22
+ // OPTIMIZER
23
+ "epochs": 50, // total number of epochs to train.
24
+ "lr": 0.00003, // Initial learning rate.
25
+ "gradient_accumulation_steps": 24,
26
+
27
+ // LOGGING
28
+ "logging_steps": 100, // Number of steps to plot.
29
+ "load_best_model_at_end": true,
30
+ "save_total_limit": 3,
31
+ "warmup_ratio": 0.05, // 0 disables. Ratio of total training steps used for a linear warmup from 0 to learning_rate.
32
+ "warmup_steps": 0, // 0 disables. Number of steps used for a linear warmup from 0 to learning_rate.
33
+
34
+ // DATA LOADING
35
+ "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good.
36
+
37
+ // MODEL
38
+ "freeze_feature_encoder": true, // Whether to freeze the feature encoder layers of the model.
39
+ "attention_dropout": 0.1, // The dropout ratio for the attention probabilities.
40
+ "activation_dropout": 0.1, // The dropout ratio for activations inside the fully connected layer.
41
+ "hidden_dropout": 0.1, // The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
42
+ "feat_proj_dropout": 0.1, // The dropout probability for the output of the feature encoder (applied in the feature projection).
43
+ "mask_time_prob": 0.05, // Probability of each feature vector along the time axis to be chosen as the start of the vector span to be masked.
44
+ "layerdrop": 0.0, // The LayerDrop probability.
45
+ "gradient_checkpointing": true, // If True, use gradient checkpointing to save memory at the expense of slower backward pass.
46
+
47
+
48
+ // PATHS
49
+ "output_path": "../checkpoints/Wav2Vec/NURC-SP/final-version/train/",
50
+ // CACHE
51
+ "dataset_cache": "../datasets/",
52
+
53
+ // DATASETS
54
+ "datasets":{
55
+ "files_path": "../datasets/NURC-SP/audios/", // relative path for the audio files; it will be joined with the audio path from the CSV
56
+ "train":
57
+ [
58
+ // these dicts are passed directly to load_dataset; see the documentation: https://huggingface.co/docs/datasets/package_reference/loading_methods.html#datasets.load_dataset
59
+ {
60
+ "name": "csv",
61
+ "path": "csv",
62
+ "data_files": ["../datasets/NURC-SP/corpus_2_train.csv"], // csv files
63
+ "text_column": "text",
64
+ "path_column": "file_path"
65
+ }
66
+ ]
67
+ ,
68
+ "devel":
69
+ [
70
+ {
71
+ "name": "csv",
72
+ "path": "csv",
73
+ "data_files": ["../datasets/NURC-SP/corpus_2_dev.csv"], // csv files
74
+ "text_column": "text",
75
+ "path_column": "file_path"
76
+ }
77
+
78
+ ]
79
+ }//,
80
+ // used only for test
81
+ // "KenLM":{
82
+ // "kenlm_model_path": "../../kenLM/binaries/subtitle/4-gram/lm.binary", // Path for KenLM model
83
+ // "lexicon_path": "example/lexicon.lst", // file with all words, used to limit the decoder search
84
+ // "beam": 2048,
85
+ // "nbest": 1,
86
+ // "beam_threshold": 25,
87
+ // "lm_weight": 1,
88
+ // "word_score": -1,
89
+ // "sil_weight": 0
90
+ // }
91
+
92
+
93
+ }
94
+
eval_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 16.0,
3
+ "eval_loss": 0.47797250747680664,
4
+ "eval_runtime": 91.602,
5
+ "eval_samples": 3135,
6
+ "eval_samples_per_second": 34.224,
7
+ "eval_steps_per_second": 4.279,
8
+ "eval_wer": 0.23347451217667523
9
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:493dd95b8bd989f4026512f5285fed16a1d130d9b822c4d79eff5320257f3f2f
3
+ size 1261991980
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "processor_class": "Wav2Vec2Processor",
8
+ "return_attention_mask": true,
9
+ "sampling_rate": 16000
10
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": false
10
+ },
11
+ "2": {
12
+ "content": "<unk>",
13
+ "lstrip": true,
14
+ "normalized": false,
15
+ "rstrip": true,
16
+ "single_word": false,
17
+ "special": false
18
+ },
19
+ "43": {
20
+ "content": "<s>",
21
+ "lstrip": true,
22
+ "normalized": false,
23
+ "rstrip": true,
24
+ "single_word": false,
25
+ "special": false
26
+ },
27
+ "44": {
28
+ "content": "</s>",
29
+ "lstrip": true,
30
+ "normalized": false,
31
+ "rstrip": true,
32
+ "single_word": false,
33
+ "special": false
34
+ }
35
+ },
36
+ "bos_token": "<s>",
37
+ "clean_up_tokenization_spaces": true,
38
+ "do_lower_case": false,
39
+ "eos_token": "</s>",
40
+ "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "<pad>",
42
+ "processor_class": "Wav2Vec2Processor",
43
+ "replace_word_delimiter_char": " ",
44
+ "target_lang": null,
45
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
46
+ "unk_token": "<unk>",
47
+ "word_delimiter_token": "|"
48
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 16.0,
3
+ "train_loss": 103989713142.67763,
4
+ "train_runtime": 155022.8715,
5
+ "train_samples": 166630,
6
+ "train_samples_per_second": 53.744,
7
+ "train_steps_per_second": 0.28
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.47799625992774963,
3
+ "best_model_checkpoint": "../checkpoints/Wav2Vec/NURC-SP/final-version/train/checkpoint-5207",
4
+ "epoch": 16.0,
5
+ "eval_steps": 500,
6
+ "global_step": 13886,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "grad_norm": 0.48270344734191895,
14
+ "learning_rate": 1.3837638376383764e-08,
15
+ "loss": 0.4969,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.12,
20
+ "grad_norm": NaN,
21
+ "learning_rate": 1.3699261992619927e-06,
22
+ "loss": 1.0935,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.23,
27
+ "grad_norm": 10.196231842041016,
28
+ "learning_rate": 2.753690036900369e-06,
29
+ "loss": 1.0424,
30
+ "step": 200
31
+ },
32
+ {
33
+ "epoch": 0.35,
34
+ "grad_norm": 7.084147930145264,
35
+ "learning_rate": 4.137453874538745e-06,
36
+ "loss": 0.9727,
37
+ "step": 300
38
+ },
39
+ {
40
+ "epoch": 0.46,
41
+ "grad_norm": 4.268230438232422,
42
+ "learning_rate": 5.521217712177122e-06,
43
+ "loss": 0.9538,
44
+ "step": 400
45
+ },
46
+ {
47
+ "epoch": 0.58,
48
+ "grad_norm": 7.1523942947387695,
49
+ "learning_rate": 6.904981549815498e-06,
50
+ "loss": 0.941,
51
+ "step": 500
52
+ },
53
+ {
54
+ "epoch": 0.69,
55
+ "grad_norm": 7.721753120422363,
56
+ "learning_rate": 8.288745387453875e-06,
57
+ "loss": 0.9294,
58
+ "step": 600
59
+ },
60
+ {
61
+ "epoch": 0.81,
62
+ "grad_norm": 6.676694393157959,
63
+ "learning_rate": 9.67250922509225e-06,
64
+ "loss": 0.9104,
65
+ "step": 700
66
+ },
67
+ {
68
+ "epoch": 0.92,
69
+ "grad_norm": 13.917472839355469,
70
+ "learning_rate": 1.1056273062730627e-05,
71
+ "loss": 0.9171,
72
+ "step": 800
73
+ },
74
+ {
75
+ "epoch": 1.0,
76
+ "eval_loss": 0.5722900629043579,
77
+ "eval_runtime": 102.9565,
78
+ "eval_samples_per_second": 30.45,
79
+ "eval_steps_per_second": 3.807,
80
+ "eval_wer": 0.25805475722281046,
81
+ "step": 867
82
+ },
83
+ {
84
+ "epoch": 1.04,
85
+ "grad_norm": 1.1242845058441162,
86
+ "learning_rate": 1.2440036900369004e-05,
87
+ "loss": 0.8226,
88
+ "step": 900
89
+ },
90
+ {
91
+ "epoch": 1.15,
92
+ "grad_norm": 1.4485478401184082,
93
+ "learning_rate": 1.382380073800738e-05,
94
+ "loss": 0.8966,
95
+ "step": 1000
96
+ },
97
+ {
98
+ "epoch": 1.27,
99
+ "grad_norm": 1.5878037214279175,
100
+ "learning_rate": 1.5207564575645757e-05,
101
+ "loss": 0.881,
102
+ "step": 1100
103
+ },
104
+ {
105
+ "epoch": 1.38,
106
+ "grad_norm": 0.9301995635032654,
107
+ "learning_rate": 1.6591328413284134e-05,
108
+ "loss": 0.8691,
109
+ "step": 1200
110
+ },
111
+ {
112
+ "epoch": 1.5,
113
+ "grad_norm": 1.2811486721038818,
114
+ "learning_rate": 1.797509225092251e-05,
115
+ "loss": 0.8756,
116
+ "step": 1300
117
+ },
118
+ {
119
+ "epoch": 1.61,
120
+ "grad_norm": 1.5193356275558472,
121
+ "learning_rate": 1.9358856088560884e-05,
122
+ "loss": 0.8687,
123
+ "step": 1400
124
+ },
125
+ {
126
+ "epoch": 1.73,
127
+ "grad_norm": 1.1962685585021973,
128
+ "learning_rate": 2.072878228782288e-05,
129
+ "loss": 0.8588,
130
+ "step": 1500
131
+ },
132
+ {
133
+ "epoch": 1.84,
134
+ "grad_norm": 1.906406044960022,
135
+ "learning_rate": 2.2112546125461254e-05,
136
+ "loss": 0.8614,
137
+ "step": 1600
138
+ },
139
+ {
140
+ "epoch": 1.96,
141
+ "grad_norm": 2.207412004470825,
142
+ "learning_rate": 2.349630996309963e-05,
143
+ "loss": 0.8574,
144
+ "step": 1700
145
+ },
146
+ {
147
+ "epoch": 2.0,
148
+ "eval_loss": 0.5288712978363037,
149
+ "eval_runtime": 107.9908,
150
+ "eval_samples_per_second": 29.03,
151
+ "eval_steps_per_second": 3.63,
152
+ "eval_wer": 0.254853022739878,
153
+ "step": 1735
154
+ },
155
+ {
156
+ "epoch": 2.07,
157
+ "grad_norm": 1.1061471700668335,
158
+ "learning_rate": 2.4880073800738008e-05,
159
+ "loss": 0.8829,
160
+ "step": 1800
161
+ },
162
+ {
163
+ "epoch": 2.19,
164
+ "grad_norm": 0.760769784450531,
165
+ "learning_rate": 2.6263837638376385e-05,
166
+ "loss": 0.8459,
167
+ "step": 1900
168
+ },
169
+ {
170
+ "epoch": 2.3,
171
+ "grad_norm": 1.408493161201477,
172
+ "learning_rate": 2.764760147601476e-05,
173
+ "loss": 0.8378,
174
+ "step": 2000
175
+ },
176
+ {
177
+ "epoch": 2.42,
178
+ "grad_norm": 0.9650698304176331,
179
+ "learning_rate": 2.9031365313653138e-05,
180
+ "loss": 0.8415,
181
+ "step": 2100
182
+ },
183
+ {
184
+ "epoch": 2.53,
185
+ "grad_norm": 0.9561858177185059,
186
+ "learning_rate": 2.9978145791850807e-05,
187
+ "loss": 0.8364,
188
+ "step": 2200
189
+ },
190
+ {
191
+ "epoch": 2.65,
192
+ "grad_norm": 0.7619096040725708,
193
+ "learning_rate": 2.9905298431353505e-05,
194
+ "loss": 0.8364,
195
+ "step": 2300
196
+ },
197
+ {
198
+ "epoch": 2.77,
199
+ "grad_norm": 1.0944970846176147,
200
+ "learning_rate": 2.9833179544461174e-05,
201
+ "loss": 0.8256,
202
+ "step": 2400
203
+ },
204
+ {
205
+ "epoch": 2.88,
206
+ "grad_norm": 0.7943573594093323,
207
+ "learning_rate": 2.9760332183963868e-05,
208
+ "loss": 0.8175,
209
+ "step": 2500
210
+ },
211
+ {
212
+ "epoch": 3.0,
213
+ "grad_norm": 2.035665988922119,
214
+ "learning_rate": 2.9688213297071538e-05,
215
+ "loss": 0.8533,
216
+ "step": 2600
217
+ },
218
+ {
219
+ "epoch": 3.0,
220
+ "eval_loss": 0.5230820178985596,
221
+ "eval_runtime": 99.7653,
222
+ "eval_samples_per_second": 31.424,
223
+ "eval_steps_per_second": 3.929,
224
+ "eval_wer": 0.2511722886098926,
225
+ "step": 2603
226
+ },
227
+ {
228
+ "epoch": 3.11,
229
+ "grad_norm": 1.9500775337219238,
230
+ "learning_rate": 2.9615365936574232e-05,
231
+ "loss": 0.8022,
232
+ "step": 2700
233
+ },
234
+ {
235
+ "epoch": 3.23,
236
+ "grad_norm": 2.2833547592163086,
237
+ "learning_rate": 2.954251857607693e-05,
238
+ "loss": 0.8143,
239
+ "step": 2800
240
+ },
241
+ {
242
+ "epoch": 3.34,
243
+ "grad_norm": 1.7430369853973389,
244
+ "learning_rate": 2.9469671215579623e-05,
245
+ "loss": 0.8134,
246
+ "step": 2900
247
+ },
248
+ {
249
+ "epoch": 3.46,
250
+ "grad_norm": 1.7504712343215942,
251
+ "learning_rate": 2.9396823855082317e-05,
252
+ "loss": 0.8026,
253
+ "step": 3000
254
+ },
255
+ {
256
+ "epoch": 3.57,
257
+ "grad_norm": 1.6993814706802368,
258
+ "learning_rate": 2.9323976494585015e-05,
259
+ "loss": 0.8098,
260
+ "step": 3100
261
+ },
262
+ {
263
+ "epoch": 3.69,
264
+ "grad_norm": 3.484405279159546,
265
+ "learning_rate": 2.925112913408771e-05,
266
+ "loss": 0.812,
267
+ "step": 3200
268
+ },
269
+ {
270
+ "epoch": 3.8,
271
+ "grad_norm": 2.0759809017181396,
272
+ "learning_rate": 2.917901024719538e-05,
273
+ "loss": 0.8121,
274
+ "step": 3300
275
+ },
276
+ {
277
+ "epoch": 3.92,
278
+ "grad_norm": 2.325293779373169,
279
+ "learning_rate": 2.9106162886698072e-05,
280
+ "loss": 0.8076,
281
+ "step": 3400
282
+ },
283
+ {
284
+ "epoch": 4.0,
285
+ "eval_loss": 0.5064914226531982,
286
+ "eval_runtime": 117.608,
287
+ "eval_samples_per_second": 26.656,
288
+ "eval_steps_per_second": 3.333,
289
+ "eval_wer": 0.23836535067816267,
290
+ "step": 3471
291
+ },
292
+ {
293
+ "epoch": 4.03,
294
+ "grad_norm": 1.8136922121047974,
295
+ "learning_rate": 2.9033315526200766e-05,
296
+ "loss": 0.7724,
297
+ "step": 3500
298
+ },
299
+ {
300
+ "epoch": 4.15,
301
+ "grad_norm": 7.821990013122559,
302
+ "learning_rate": 2.8961196639308436e-05,
303
+ "loss": 0.7879,
304
+ "step": 3600
305
+ },
306
+ {
307
+ "epoch": 4.26,
308
+ "grad_norm": 0.916205108165741,
309
+ "learning_rate": 2.888834927881113e-05,
310
+ "loss": 0.7907,
311
+ "step": 3700
312
+ },
313
+ {
314
+ "epoch": 4.38,
315
+ "grad_norm": 1.157679557800293,
316
+ "learning_rate": 2.8815501918313824e-05,
317
+ "loss": 0.7859,
318
+ "step": 3800
319
+ },
320
+ {
321
+ "epoch": 4.49,
322
+ "grad_norm": 1.1060450077056885,
323
+ "learning_rate": 2.874265455781652e-05,
324
+ "loss": 0.7934,
325
+ "step": 3900
326
+ },
327
+ {
328
+ "epoch": 4.61,
329
+ "grad_norm": 1.0524017810821533,
330
+ "learning_rate": 2.8669807197319215e-05,
331
+ "loss": 0.7887,
332
+ "step": 4000
333
+ },
334
+ {
335
+ "epoch": 4.72,
336
+ "grad_norm": 0.9493302702903748,
337
+ "learning_rate": 2.8596959836821913e-05,
338
+ "loss": 0.7962,
339
+ "step": 4100
340
+ },
341
+ {
342
+ "epoch": 4.84,
343
+ "grad_norm": 1.216361403465271,
344
+ "learning_rate": 2.8524840949929582e-05,
345
+ "loss": 0.7902,
346
+ "step": 4200
347
+ },
348
+ {
349
+ "epoch": 4.95,
350
+ "grad_norm": 0.858220100402832,
351
+ "learning_rate": 2.8451993589432277e-05,
352
+ "loss": 0.7875,
353
+ "step": 4300
354
+ },
355
+ {
356
+ "epoch": 5.0,
357
+ "eval_loss": 0.4950461685657501,
358
+ "eval_runtime": 107.5304,
359
+ "eval_samples_per_second": 29.155,
360
+ "eval_steps_per_second": 3.645,
361
+ "eval_wer": 0.23864266626329855,
362
+ "step": 4339
363
+ },
364
+ {
365
+ "epoch": 5.07,
366
+ "grad_norm": 0.6915091872215271,
367
+ "learning_rate": 2.8379146228934974e-05,
368
+ "loss": 0.8172,
369
+ "step": 4400
370
+ },
371
+ {
372
+ "epoch": 5.19,
373
+ "grad_norm": 0.8395456671714783,
374
+ "learning_rate": 2.8306298868437668e-05,
375
+ "loss": 0.7719,
376
+ "step": 4500
377
+ },
378
+ {
379
+ "epoch": 5.3,
380
+ "grad_norm": 0.5675917863845825,
381
+ "learning_rate": 2.8233451507940365e-05,
382
+ "loss": 0.7725,
383
+ "step": 4600
384
+ },
385
+ {
386
+ "epoch": 5.42,
387
+ "grad_norm": 0.7588133215904236,
388
+ "learning_rate": 2.816060414744306e-05,
389
+ "loss": 0.7735,
390
+ "step": 4700
391
+ },
392
+ {
393
+ "epoch": 5.53,
394
+ "grad_norm": 0.7658157348632812,
395
+ "learning_rate": 2.8087756786945757e-05,
396
+ "loss": 0.7713,
397
+ "step": 4800
398
+ },
399
+ {
400
+ "epoch": 5.65,
401
+ "grad_norm": 0.6394296288490295,
402
+ "learning_rate": 2.801490942644845e-05,
403
+ "loss": 0.7683,
404
+ "step": 4900
405
+ },
406
+ {
407
+ "epoch": 5.76,
408
+ "grad_norm": 1.0770230293273926,
409
+ "learning_rate": 2.794206206595114e-05,
410
+ "loss": 0.784,
411
+ "step": 5000
412
+ },
413
+ {
414
+ "epoch": 5.88,
415
+ "grad_norm": 0.537818193435669,
416
+ "learning_rate": 2.786921470545384e-05,
417
+ "loss": 0.7705,
418
+ "step": 5100
419
+ },
420
+ {
421
+ "epoch": 5.99,
422
+ "grad_norm": 1.1256853342056274,
423
+ "learning_rate": 2.7796367344956533e-05,
424
+ "loss": 0.7947,
425
+ "step": 5200
426
+ },
427
+ {
428
+ "epoch": 6.0,
429
+ "eval_loss": 0.47799625992774963,
430
+ "eval_runtime": 102.2073,
431
+ "eval_samples_per_second": 30.673,
432
+ "eval_steps_per_second": 3.835,
433
+ "eval_wer": 0.23347451217667523,
434
+ "step": 5207
435
+ },
436
+ {
437
+ "epoch": 6.11,
438
+ "grad_norm": 2.259537935256958,
439
+ "learning_rate": 2.772351998445923e-05,
440
+ "loss": 0.7386,
441
+ "step": 5300
442
+ },
443
+ {
444
+ "epoch": 6.22,
445
+ "grad_norm": 1.8576114177703857,
446
+ "learning_rate": 2.7650672623961924e-05,
447
+ "loss": 0.7591,
448
+ "step": 5400
449
+ },
450
+ {
451
+ "epoch": 6.34,
452
+ "grad_norm": 3.029550075531006,
453
+ "learning_rate": 2.7578553737069594e-05,
454
+ "loss": 0.765,
455
+ "step": 5500
456
+ },
457
+ {
458
+ "epoch": 6.45,
459
+ "grad_norm": 1.3059985637664795,
460
+ "learning_rate": 2.750570637657229e-05,
461
+ "loss": 0.7484,
462
+ "step": 5600
463
+ },
464
+ {
465
+ "epoch": 6.57,
466
+ "grad_norm": 2.597036600112915,
467
+ "learning_rate": 2.7432859016074985e-05,
468
+ "loss": 0.7696,
469
+ "step": 5700
470
+ },
471
+ {
472
+ "epoch": 6.68,
473
+ "grad_norm": 1.8154231309890747,
474
+ "learning_rate": 2.736074012918265e-05,
475
+ "loss": 0.7642,
476
+ "step": 5800
477
+ },
478
+ {
479
+ "epoch": 6.8,
480
+ "grad_norm": 1.9362813234329224,
481
+ "learning_rate": 2.728789276868535e-05,
482
+ "loss": 0.7703,
483
+ "step": 5900
484
+ },
485
+ {
486
+ "epoch": 6.91,
487
+ "grad_norm": 2.5824599266052246,
488
+ "learning_rate": 2.7215045408188043e-05,
489
+ "loss": 0.7621,
490
+ "step": 6000
491
+ },
492
+ {
493
+ "epoch": 7.0,
494
+ "eval_loss": 0.4984392821788788,
495
+ "eval_runtime": 96.472,
496
+ "eval_samples_per_second": 32.496,
497
+ "eval_steps_per_second": 4.063,
498
+ "eval_wer": 0.23700398326022287,
499
+ "step": 6075
500
+ },
501
+ {
502
+ "epoch": 7.03,
503
+ "grad_norm": 1.1438063383102417,
504
+ "learning_rate": 2.714219804769074e-05,
505
+ "loss": 0.7505,
506
+ "step": 6100
507
+ },
508
+ {
509
+ "epoch": 7.14,
510
+ "grad_norm": 0.8449379205703735,
511
+ "learning_rate": 2.7069350687193434e-05,
512
+ "loss": 0.7626,
513
+ "step": 6200
514
+ },
515
+ {
516
+ "epoch": 7.26,
517
+ "grad_norm": 1.075359582901001,
518
+ "learning_rate": 2.699650332669613e-05,
519
+ "loss": 0.7503,
520
+ "step": 6300
521
+ },
522
+ {
523
+ "epoch": 7.37,
524
+ "grad_norm": NaN,
525
+ "learning_rate": 2.697756301296683e-05,
526
+ "loss": 14440011566940.16,
527
+ "step": 6400
528
+ },
529
+ {
530
+ "epoch": 7.49,
531
+ "grad_norm": NaN,
532
+ "learning_rate": 2.697756301296683e-05,
533
+ "loss": 0.0,
534
+ "step": 6500
535
+ },
536
+ {
537
+ "epoch": 7.6,
538
+ "grad_norm": NaN,
539
+ "learning_rate": 2.697756301296683e-05,
540
+ "loss": 0.0,
541
+ "step": 6600
542
+ },
543
+ {
544
+ "epoch": 7.72,
545
+ "grad_norm": NaN,
546
+ "learning_rate": 2.697756301296683e-05,
547
+ "loss": 0.0,
548
+ "step": 6700
549
+ },
550
+ {
551
+ "epoch": 7.84,
552
+ "grad_norm": NaN,
553
+ "learning_rate": 2.697756301296683e-05,
554
+ "loss": 0.0,
555
+ "step": 6800
556
+ },
557
+ {
558
+ "epoch": 7.95,
559
+ "grad_norm": NaN,
560
+ "learning_rate": 2.697756301296683e-05,
561
+ "loss": 0.0,
562
+ "step": 6900
563
+ },
564
+ {
565
+ "epoch": 8.0,
566
+ "eval_loss": NaN,
567
+ "eval_runtime": 97.0769,
568
+ "eval_samples_per_second": 32.294,
569
+ "eval_steps_per_second": 4.038,
570
+ "eval_wer": 1.0,
571
+ "step": 6943
572
+ },
573
+ {
574
+ "epoch": 8.07,
575
+ "grad_norm": NaN,
576
+ "learning_rate": 2.697756301296683e-05,
577
+ "loss": 0.0,
578
+ "step": 7000
579
+ },
580
+ {
581
+ "epoch": 8.18,
582
+ "grad_norm": NaN,
583
+ "learning_rate": 2.697756301296683e-05,
584
+ "loss": 0.0,
585
+ "step": 7100
586
+ },
587
+ {
588
+ "epoch": 8.3,
589
+ "grad_norm": NaN,
590
+ "learning_rate": 2.697756301296683e-05,
591
+ "loss": 0.0,
592
+ "step": 7200
593
+ },
594
+ {
595
+ "epoch": 8.41,
596
+ "grad_norm": NaN,
597
+ "learning_rate": 2.697756301296683e-05,
598
+ "loss": 0.0,
599
+ "step": 7300
600
+ },
601
+ {
602
+ "epoch": 8.53,
603
+ "grad_norm": NaN,
604
+ "learning_rate": 2.697756301296683e-05,
605
+ "loss": 0.0,
606
+ "step": 7400
607
+ },
608
+ {
609
+ "epoch": 8.64,
610
+ "grad_norm": NaN,
611
+ "learning_rate": 2.697756301296683e-05,
612
+ "loss": 0.0,
613
+ "step": 7500
614
+ },
615
+ {
616
+ "epoch": 8.76,
617
+ "grad_norm": NaN,
618
+ "learning_rate": 2.697756301296683e-05,
619
+ "loss": 0.0,
620
+ "step": 7600
621
+ },
622
+ {
623
+ "epoch": 8.87,
624
+ "grad_norm": NaN,
625
+ "learning_rate": 2.697756301296683e-05,
626
+ "loss": 0.0,
627
+ "step": 7700
628
+ },
629
+ {
630
+ "epoch": 8.99,
631
+ "grad_norm": NaN,
632
+ "learning_rate": 2.697756301296683e-05,
633
+ "loss": 0.0,
634
+ "step": 7800
635
+ },
636
+ {
637
+ "epoch": 9.0,
638
+ "eval_loss": NaN,
639
+ "eval_runtime": 108.3909,
640
+ "eval_samples_per_second": 28.923,
641
+ "eval_steps_per_second": 3.617,
642
+ "eval_wer": 1.0,
643
+ "step": 7810
644
+ },
645
+ {
646
+ "epoch": 9.1,
647
+ "grad_norm": NaN,
648
+ "learning_rate": 2.697756301296683e-05,
649
+ "loss": 0.0,
650
+ "step": 7900
651
+ },
652
+ {
653
+ "epoch": 9.22,
654
+ "grad_norm": NaN,
655
+ "learning_rate": 2.697756301296683e-05,
656
+ "loss": 0.0,
657
+ "step": 8000
658
+ },
659
+ {
660
+ "epoch": 9.33,
661
+ "grad_norm": NaN,
662
+ "learning_rate": 2.697756301296683e-05,
663
+ "loss": 0.0,
664
+ "step": 8100
665
+ },
666
+ {
667
+ "epoch": 9.45,
668
+ "grad_norm": NaN,
669
+ "learning_rate": 2.697756301296683e-05,
670
+ "loss": 0.0,
671
+ "step": 8200
672
+ },
673
+ {
674
+ "epoch": 9.56,
675
+ "grad_norm": NaN,
676
+ "learning_rate": 2.697756301296683e-05,
677
+ "loss": 0.0,
678
+ "step": 8300
679
+ },
680
+ {
681
+ "epoch": 9.68,
682
+ "grad_norm": NaN,
683
+ "learning_rate": 2.697756301296683e-05,
684
+ "loss": 0.0,
685
+ "step": 8400
686
+ },
687
+ {
688
+ "epoch": 9.79,
689
+ "grad_norm": NaN,
690
+ "learning_rate": 2.697756301296683e-05,
691
+ "loss": 0.0,
692
+ "step": 8500
693
+ },
694
+ {
695
+ "epoch": 9.91,
696
+ "grad_norm": NaN,
697
+ "learning_rate": 2.697756301296683e-05,
698
+ "loss": 0.0,
699
+ "step": 8600
700
+ },
701
+ {
702
+ "epoch": 10.0,
703
+ "eval_loss": NaN,
704
+ "eval_runtime": 107.8221,
705
+ "eval_samples_per_second": 29.076,
706
+ "eval_steps_per_second": 3.636,
707
+ "eval_wer": 1.0,
708
+ "step": 8678
709
+ },
710
+ {
711
+ "epoch": 10.02,
712
+ "grad_norm": NaN,
713
+ "learning_rate": 2.697756301296683e-05,
714
+ "loss": 0.0,
715
+ "step": 8700
716
+ },
717
+ {
718
+ "epoch": 10.14,
719
+ "grad_norm": NaN,
720
+ "learning_rate": 2.697756301296683e-05,
721
+ "loss": 0.0,
722
+ "step": 8800
723
+ },
724
+ {
725
+ "epoch": 10.25,
726
+ "grad_norm": NaN,
727
+ "learning_rate": 2.697756301296683e-05,
728
+ "loss": 0.0,
729
+ "step": 8900
730
+ },
731
+ {
732
+ "epoch": 10.37,
733
+ "grad_norm": NaN,
734
+ "learning_rate": 2.697756301296683e-05,
735
+ "loss": 0.0,
736
+ "step": 9000
737
+ },
738
+ {
739
+ "epoch": 10.49,
740
+ "grad_norm": NaN,
741
+ "learning_rate": 2.697756301296683e-05,
742
+ "loss": 0.0,
743
+ "step": 9100
744
+ },
745
+ {
746
+ "epoch": 10.6,
747
+ "grad_norm": NaN,
748
+ "learning_rate": 2.697756301296683e-05,
749
+ "loss": 0.0,
750
+ "step": 9200
751
+ },
752
+ {
753
+ "epoch": 10.72,
754
+ "grad_norm": NaN,
755
+ "learning_rate": 2.697756301296683e-05,
756
+ "loss": 0.0,
757
+ "step": 9300
758
+ },
759
+ {
760
+ "epoch": 10.83,
761
+ "grad_norm": NaN,
762
+ "learning_rate": 2.697756301296683e-05,
763
+ "loss": 0.0,
764
+ "step": 9400
765
+ },
766
+ {
767
+ "epoch": 10.95,
768
+ "grad_norm": NaN,
769
+ "learning_rate": 2.697756301296683e-05,
770
+ "loss": 0.0,
771
+ "step": 9500
772
+ },
773
+ {
774
+ "epoch": 11.0,
775
+ "eval_loss": NaN,
776
+ "eval_runtime": 116.4634,
777
+ "eval_samples_per_second": 26.918,
778
+ "eval_steps_per_second": 3.366,
779
+ "eval_wer": 1.0,
780
+ "step": 9546
781
+ },
782
+ {
783
+ "epoch": 11.06,
784
+ "grad_norm": NaN,
785
+ "learning_rate": 2.697756301296683e-05,
786
+ "loss": 0.0,
787
+ "step": 9600
788
+ },
789
+ {
790
+ "epoch": 11.18,
791
+ "grad_norm": NaN,
792
+ "learning_rate": 2.697756301296683e-05,
793
+ "loss": 0.0,
794
+ "step": 9700
795
+ },
796
+ {
797
+ "epoch": 11.29,
798
+ "grad_norm": NaN,
799
+ "learning_rate": 2.697756301296683e-05,
800
+ "loss": 0.0,
801
+ "step": 9800
802
+ },
803
+ {
804
+ "epoch": 11.41,
805
+ "grad_norm": NaN,
806
+ "learning_rate": 2.697756301296683e-05,
807
+ "loss": 0.0,
808
+ "step": 9900
809
+ },
810
+ {
811
+ "epoch": 11.52,
812
+ "grad_norm": NaN,
813
+ "learning_rate": 2.697756301296683e-05,
814
+ "loss": 0.0,
815
+ "step": 10000
816
+ },
817
+ {
818
+ "epoch": 11.64,
819
+ "grad_norm": NaN,
820
+ "learning_rate": 2.697756301296683e-05,
821
+ "loss": 0.0,
822
+ "step": 10100
823
+ },
824
+ {
825
+ "epoch": 11.75,
826
+ "grad_norm": NaN,
827
+ "learning_rate": 2.697756301296683e-05,
828
+ "loss": 0.0,
829
+ "step": 10200
830
+ },
831
+ {
832
+ "epoch": 11.87,
833
+ "grad_norm": NaN,
834
+ "learning_rate": 2.697756301296683e-05,
835
+ "loss": 0.0,
836
+ "step": 10300
837
+ },
838
+ {
839
+ "epoch": 11.98,
840
+ "grad_norm": NaN,
841
+ "learning_rate": 2.697756301296683e-05,
842
+ "loss": 0.0,
843
+ "step": 10400
844
+ },
845
+ {
846
+ "epoch": 12.0,
847
+ "eval_loss": NaN,
848
+ "eval_runtime": 114.432,
849
+ "eval_samples_per_second": 27.396,
850
+ "eval_steps_per_second": 3.426,
851
+ "eval_wer": 1.0,
852
+ "step": 10414
853
+ },
854
+ {
855
+ "epoch": 12.1,
856
+ "grad_norm": NaN,
857
+ "learning_rate": 2.697756301296683e-05,
858
+ "loss": 0.0,
859
+ "step": 10500
860
+ },
861
+ {
862
+ "epoch": 12.21,
863
+ "grad_norm": NaN,
864
+ "learning_rate": 2.697756301296683e-05,
865
+ "loss": 0.0,
866
+ "step": 10600
867
+ },
868
+ {
869
+ "epoch": 12.33,
870
+ "grad_norm": NaN,
871
+ "learning_rate": 2.697756301296683e-05,
872
+ "loss": 0.0,
873
+ "step": 10700
874
+ },
875
+ {
876
+ "epoch": 12.44,
877
+ "grad_norm": NaN,
878
+ "learning_rate": 2.697756301296683e-05,
879
+ "loss": 0.0,
880
+ "step": 10800
881
+ },
882
+ {
883
+ "epoch": 12.56,
884
+ "grad_norm": NaN,
885
+ "learning_rate": 2.697756301296683e-05,
886
+ "loss": 0.0,
887
+ "step": 10900
888
+ },
889
+ {
890
+ "epoch": 12.67,
891
+ "grad_norm": NaN,
892
+ "learning_rate": 2.697756301296683e-05,
893
+ "loss": 0.0,
894
+ "step": 11000
895
+ },
896
+ {
897
+ "epoch": 12.79,
898
+ "grad_norm": NaN,
899
+ "learning_rate": 2.697756301296683e-05,
900
+ "loss": 0.0,
901
+ "step": 11100
902
+ },
903
+ {
904
+ "epoch": 12.91,
905
+ "grad_norm": NaN,
906
+ "learning_rate": 2.697756301296683e-05,
907
+ "loss": 0.0,
908
+ "step": 11200
909
+ },
910
+ {
911
+ "epoch": 13.0,
912
+ "eval_loss": NaN,
913
+ "eval_runtime": 107.4788,
914
+ "eval_samples_per_second": 29.169,
915
+ "eval_steps_per_second": 3.647,
916
+ "eval_wer": 1.0,
917
+ "step": 11282
918
+ },
919
+ {
920
+ "epoch": 13.02,
921
+ "grad_norm": NaN,
922
+ "learning_rate": 2.697756301296683e-05,
923
+ "loss": 0.0,
924
+ "step": 11300
925
+ },
926
+ {
927
+ "epoch": 13.14,
928
+ "grad_norm": NaN,
929
+ "learning_rate": 2.697756301296683e-05,
930
+ "loss": 0.0,
931
+ "step": 11400
932
+ },
933
+ {
934
+ "epoch": 13.25,
935
+ "grad_norm": NaN,
936
+ "learning_rate": 2.697756301296683e-05,
937
+ "loss": 0.0,
938
+ "step": 11500
939
+ },
940
+ {
941
+ "epoch": 13.37,
942
+ "grad_norm": NaN,
943
+ "learning_rate": 2.697756301296683e-05,
944
+ "loss": 0.0,
945
+ "step": 11600
946
+ },
947
+ {
948
+ "epoch": 13.48,
949
+ "grad_norm": NaN,
950
+ "learning_rate": 2.697756301296683e-05,
951
+ "loss": 0.0,
952
+ "step": 11700
953
+ },
954
+ {
955
+ "epoch": 13.6,
956
+ "grad_norm": NaN,
957
+ "learning_rate": 2.697756301296683e-05,
958
+ "loss": 0.0,
959
+ "step": 11800
960
+ },
961
+ {
962
+ "epoch": 13.71,
963
+ "grad_norm": NaN,
964
+ "learning_rate": 2.697756301296683e-05,
965
+ "loss": 0.0,
966
+ "step": 11900
967
+ },
968
+ {
969
+ "epoch": 13.83,
970
+ "grad_norm": NaN,
971
+ "learning_rate": 2.697756301296683e-05,
972
+ "loss": 0.0,
973
+ "step": 12000
974
+ },
975
+ {
976
+ "epoch": 13.94,
977
+ "grad_norm": NaN,
978
+ "learning_rate": 2.697756301296683e-05,
979
+ "loss": 0.0,
980
+ "step": 12100
981
+ },
982
+ {
983
+ "epoch": 14.0,
984
+ "eval_loss": NaN,
985
+ "eval_runtime": 106.6864,
986
+ "eval_samples_per_second": 29.385,
987
+ "eval_steps_per_second": 3.674,
988
+ "eval_wer": 1.0,
989
+ "step": 12150
990
+ },
991
+ {
992
+ "epoch": 14.06,
993
+ "grad_norm": NaN,
994
+ "learning_rate": 2.697756301296683e-05,
995
+ "loss": 0.0,
996
+ "step": 12200
997
+ },
998
+ {
999
+ "epoch": 14.17,
1000
+ "grad_norm": NaN,
1001
+ "learning_rate": 2.697756301296683e-05,
1002
+ "loss": 0.0,
1003
+ "step": 12300
1004
+ },
1005
+ {
1006
+ "epoch": 14.29,
1007
+ "grad_norm": NaN,
1008
+ "learning_rate": 2.697756301296683e-05,
1009
+ "loss": 0.0,
1010
+ "step": 12400
1011
+ },
1012
+ {
1013
+ "epoch": 14.4,
1014
+ "grad_norm": NaN,
1015
+ "learning_rate": 2.697756301296683e-05,
1016
+ "loss": 0.0,
1017
+ "step": 12500
1018
+ },
1019
+ {
1020
+ "epoch": 14.52,
1021
+ "grad_norm": NaN,
1022
+ "learning_rate": 2.697756301296683e-05,
1023
+ "loss": 0.0,
1024
+ "step": 12600
1025
+ },
1026
+ {
1027
+ "epoch": 14.63,
1028
+ "grad_norm": NaN,
1029
+ "learning_rate": 2.697756301296683e-05,
1030
+ "loss": 0.0,
1031
+ "step": 12700
1032
+ },
1033
+ {
1034
+ "epoch": 14.75,
1035
+ "grad_norm": NaN,
1036
+ "learning_rate": 2.697756301296683e-05,
1037
+ "loss": 0.0,
1038
+ "step": 12800
1039
+ },
1040
+ {
1041
+ "epoch": 14.86,
1042
+ "grad_norm": NaN,
1043
+ "learning_rate": 2.697756301296683e-05,
1044
+ "loss": 0.0,
1045
+ "step": 12900
1046
+ },
1047
+ {
1048
+ "epoch": 14.98,
1049
+ "grad_norm": NaN,
1050
+ "learning_rate": 2.697756301296683e-05,
1051
+ "loss": 0.0,
1052
+ "step": 13000
1053
+ },
1054
+ {
1055
+ "epoch": 15.0,
1056
+ "eval_loss": NaN,
1057
+ "eval_runtime": 109.7539,
1058
+ "eval_samples_per_second": 28.564,
1059
+ "eval_steps_per_second": 3.572,
1060
+ "eval_wer": 1.0,
1061
+ "step": 13018
1062
+ },
1063
+ {
1064
+ "epoch": 15.09,
1065
+ "grad_norm": NaN,
1066
+ "learning_rate": 2.697756301296683e-05,
1067
+ "loss": 0.0,
1068
+ "step": 13100
1069
+ },
1070
+ {
1071
+ "epoch": 15.21,
1072
+ "grad_norm": NaN,
1073
+ "learning_rate": 2.697756301296683e-05,
1074
+ "loss": 0.0,
1075
+ "step": 13200
1076
+ },
1077
+ {
1078
+ "epoch": 15.32,
1079
+ "grad_norm": NaN,
1080
+ "learning_rate": 2.697756301296683e-05,
1081
+ "loss": 0.0,
1082
+ "step": 13300
1083
+ },
1084
+ {
1085
+ "epoch": 15.44,
1086
+ "grad_norm": NaN,
1087
+ "learning_rate": 2.697756301296683e-05,
1088
+ "loss": 0.0,
1089
+ "step": 13400
1090
+ },
1091
+ {
1092
+ "epoch": 15.56,
1093
+ "grad_norm": NaN,
1094
+ "learning_rate": 2.697756301296683e-05,
1095
+ "loss": 0.0,
1096
+ "step": 13500
1097
+ },
1098
+ {
1099
+ "epoch": 15.67,
1100
+ "grad_norm": NaN,
1101
+ "learning_rate": 2.697756301296683e-05,
1102
+ "loss": 0.0,
1103
+ "step": 13600
1104
+ },
1105
+ {
1106
+ "epoch": 15.79,
1107
+ "grad_norm": NaN,
1108
+ "learning_rate": 2.697756301296683e-05,
1109
+ "loss": 0.0,
1110
+ "step": 13700
1111
+ },
1112
+ {
1113
+ "epoch": 15.9,
1114
+ "grad_norm": NaN,
1115
+ "learning_rate": 2.697756301296683e-05,
1116
+ "loss": 0.0,
1117
+ "step": 13800
1118
+ },
1119
+ {
1120
+ "epoch": 16.0,
1121
+ "eval_loss": NaN,
1122
+ "eval_runtime": 108.5954,
1123
+ "eval_samples_per_second": 28.869,
1124
+ "eval_steps_per_second": 3.61,
1125
+ "eval_wer": 1.0,
1126
+ "step": 13886
1127
+ },
1128
+ {
1129
+ "epoch": 16.0,
1130
+ "step": 13886,
1131
+ "total_flos": 6.139205851819624e+20,
1132
+ "train_loss": 103989713142.67763,
1133
+ "train_runtime": 155022.8715,
1134
+ "train_samples_per_second": 53.744,
1135
+ "train_steps_per_second": 0.28
1136
+ }
1137
+ ],
1138
+ "logging_steps": 100,
1139
+ "max_steps": 43350,
1140
+ "num_input_tokens_seen": 0,
1141
+ "num_train_epochs": 50,
1142
+ "save_steps": 500,
1143
+ "total_flos": 6.139205851819624e+20,
1144
+ "train_batch_size": 8,
1145
+ "trial_name": null,
1146
+ "trial_params": null
1147
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f07d08380f5593b7f8241aafe43b713d22c2c6ee62cc83cc34439f35bf4f3dc
3
+ size 4984
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<pad>": 0, "|": 1, "<unk>": 2, "a": 3, "b": 4, "c": 5, "d": 6, "e": 7, "f": 8, "g": 9, "h": 10, "i": 11, "j": 12, "k": 13, "l": 14, "m": 15, "n": 16, "o": 17, "p": 18, "q": 19, "r": 20, "s": 21, "t": 22, "u": 23, "v": 24, "w": 25, "x": 26, "y": 27, "z": 28, "ç": 29, "ã": 30, "à": 31, "á": 32, "â": 33, "ê": 34, "é": 35, "í": 36, "ó": 37, "ô": 38, "õ": 39, "ú": 40, "û": 41, "-": 42, "<s>": 43, "</s>": 44}