orendar committed
Commit f82cf38
1 Parent(s): 4c6d5cc

Update from ec2-user

README.md ADDED
@@ -0,0 +1,72 @@
+ ---
+ language:
+ - en
+ - he
+ tags:
+ - generated_from_trainer
+ metrics:
+ - bleu
+ model-index:
+ - name: output_large
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # output_large
+
+ This model is a fine-tuned version of a local MarianMT checkpoint (`/home/ec2-user/SageMaker/marian_large`); the fine-tuning dataset is not specified.
+ It achieves the following results on the evaluation set:
+ - Loss: 1.5473
+ - Bleu: 32.6637
+ - Gen Len: 64.8596
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 5e-05
+ - train_batch_size: 16
+ - eval_batch_size: 16
+ - seed: 42
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - num_epochs: 10.0
+ - mixed_precision_training: Native AMP
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Bleu | Gen Len |
+ |:-------------:|:-----:|:-------:|:---------------:|:-------:|:-------:|
+ | 1.7986 | 1.0 | 565689 | 1.8566 | 28.7509 | 66.1132 |
+ | 1.6695 | 2.0 | 1131378 | 1.7653 | 29.525 | 66.2651 |
+ | 1.6038 | 3.0 | 1697067 | 1.7081 | 29.8841 | 66.1849 |
+ | 1.5515 | 4.0 | 2262756 | 1.6601 | 30.588 | 65.9093 |
+ | 1.5115 | 5.0 | 2828445 | 1.6359 | 30.9726 | 66.2171 |
+ | 1.474 | 6.0 | 3394134 | 1.6097 | 31.3244 | 66.1843 |
+ | 1.4425 | 7.0 | 3959823 | 1.5914 | 31.557 | 66.1481 |
+ | 1.4063 | 8.0 | 4525512 | 1.5666 | 32.0886 | 65.8595 |
+ | 1.3724 | 9.0 | 5091201 | 1.5537 | 32.3644 | 66.1648 |
+ | 1.3452 | 10.0 | 5656890 | 1.5473 | 32.4724 | 66.1539 |
+
+
+ ### Framework versions
+
+ - Transformers 4.12.0.dev0
+ - Pytorch 1.9.1+cu102
+ - Datasets 1.12.1
+ - Tokenizers 0.10.3
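
The auto-generated card above has no usage section. A minimal sketch (not part of the committed files), assuming the checkpoint is published under a hypothetical repo id such as `orendar/output_large` (substitute the real Hub path or a local clone), it can be loaded with the standard MarianMT classes from Transformers. The translation direction is not stated in the card, so the Hebrew input below is only an assumption based on the `en`/`he` language tags:

```python
from transformers import MarianMTModel, MarianTokenizer

# Hypothetical repo id; point this at the actual Hub repo or a local clone.
model_name = "orendar/output_large"

tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Tokenize a batch of source sentences (direction assumed he -> en).
batch = tokenizer(["שלום, מה שלומך?"], return_tensors="pt", padding=True)

# config.json sets num_beams=6 and max_length=512 as generation defaults,
# so model.generate() already performs beam search here.
generated = model.generate(**batch)
print(tokenizer.batch_decode(generated, skip_special_tokens=True))
```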
all_results.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "epoch": 10.0,
+   "eval_bleu": 32.6637,
+   "eval_gen_len": 64.8596,
+   "eval_loss": 1.5473366975784302,
+   "eval_runtime": 1365.0004,
+   "eval_samples": 17165,
+   "eval_samples_per_second": 12.575,
+   "eval_steps_per_second": 0.786,
+   "train_loss": 1.5652431253911159,
+   "train_runtime": 1147827.3858,
+   "train_samples": 9051022,
+   "train_samples_per_second": 78.854,
+   "train_steps_per_second": 4.928
+ }
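
A quick consistency check on these aggregates (a sketch, not part of the commit): the reported per-second throughputs follow directly from the sample count, the 10 training epochs, and the runtime.

```python
# Figures copied from all_results.json / trainer_state.json above.
train_samples = 9_051_022
epochs = 10.0
train_runtime_s = 1_147_827.3858
global_steps = 5_656_890  # 565,689 optimizer steps per epoch at batch size 16

print(train_samples * epochs / train_runtime_s)  # ~78.85, matches train_samples_per_second 78.854
print(global_steps / train_runtime_s)            # ~4.93,  matches train_steps_per_second 4.928
print(train_runtime_s / 86_400)                  # ~13.3 days of wall-clock training
```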
config.json ADDED
@@ -0,0 +1,60 @@
+ {
+   "_name_or_path": "/home/ec2-user/SageMaker/marian_large",
+   "activation_dropout": 0.0,
+   "activation_function": "swish",
+   "add_bias_logits": false,
+   "add_final_layer_norm": false,
+   "architectures": [
+     "MarianMTModel"
+   ],
+   "attention_dropout": 0.0,
+   "bad_words_ids": [
+     [
+       62954
+     ]
+   ],
+   "bos_token_id": 0,
+   "classifier_dropout": 0.0,
+   "d_model": 768,
+   "decoder_attention_heads": 12,
+   "decoder_ffn_dim": 4096,
+   "decoder_layerdrop": 0.0,
+   "decoder_layers": 9,
+   "decoder_start_token_id": 62954,
+   "do_blenderbot_90_layernorm": false,
+   "dropout": 0.1,
+   "encoder_attention_heads": 12,
+   "encoder_ffn_dim": 4096,
+   "encoder_layerdrop": 0.0,
+   "encoder_layers": 9,
+   "eos_token_id": 0,
+   "extra_pos_embeddings": 2,
+   "force_bos_token_to_be_generated": false,
+   "forced_eos_token_id": 0,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2"
+   },
+   "init_std": 0.02,
+   "is_encoder_decoder": true,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2
+   },
+   "max_length": 512,
+   "max_position_embeddings": 512,
+   "model_type": "marian",
+   "normalize_before": false,
+   "normalize_embedding": false,
+   "num_beams": 6,
+   "num_hidden_layers": 9,
+   "pad_token_id": 62954,
+   "scale_embedding": true,
+   "static_position_embeddings": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.12.0.dev0",
+   "use_cache": true,
+   "vocab_size": 62955
+ }
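
For inspecting these architecture settings programmatically, a small sketch (the local path is illustrative): the config can be loaded on its own with `MarianConfig`, without downloading the ~900 MB checkpoint.

```python
from transformers import MarianConfig

# Illustrative path: a local clone of this repository (or the Hub repo id).
config = MarianConfig.from_pretrained("./output_large")

print(config.d_model, config.encoder_layers, config.decoder_layers)  # 768 9 9
print(config.encoder_attention_heads, config.encoder_ffn_dim)        # 12 4096
print(config.vocab_size, config.pad_token_id, config.eos_token_id)   # 62955 62954 0
print(config.num_beams, config.max_length)                           # generation defaults: 6 512
```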
eval_results.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "epoch": 10.0,
+   "eval_bleu": 32.6637,
+   "eval_gen_len": 64.8596,
+   "eval_loss": 1.5473366975784302,
+   "eval_runtime": 1365.0004,
+   "eval_samples": 17165,
+   "eval_samples_per_second": 12.575,
+   "eval_steps_per_second": 0.786
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dced33fde86920378b4a0189ee91983072e2d712154c1bc15dabc9463ea4dfa4
+ size 902523465
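
The three lines above are a Git LFS pointer, not the weights themselves: the real ~900 MB `pytorch_model.bin` is fetched separately (for example via `git lfs pull` or the Hub download tooling). A small sketch of what such a pointer encodes:

```python
# Parse a Git LFS pointer file: three "key value" lines per the LFS spec.
pointer_text = """version https://git-lfs.github.com/spec/v1
oid sha256:dced33fde86920378b4a0189ee91983072e2d712154c1bc15dabc9463ea4dfa4
size 902523465"""

fields = dict(line.split(" ", 1) for line in pointer_text.splitlines())
print(fields["oid"])                    # sha256 digest of the actual weights file
print(int(fields["size"]) / 1e9, "GB")  # ~0.9 GB on disk
```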
source.spm ADDED
Binary file (797 kB).
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
target.spm ADDED
Binary file (885 kB).
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"source_lang": null, "target_lang": null, "unk_token": "<unk>", "eos_token": "</s>", "pad_token": "<pad>", "model_max_length": 512, "sp_model_kwargs": {}, "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "/home/ec2-user/SageMaker/marian_large", "tokenizer_class": "MarianTokenizer"}
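
The two SentencePiece models plus the JSON files above are everything `MarianTokenizer` needs. A minimal loading sketch (the path is illustrative), confirming the special tokens and the 512-token limit declared in `tokenizer_config.json`:

```python
from transformers import MarianTokenizer

# Illustrative path: a local clone containing source.spm, target.spm and the JSON files above.
tokenizer = MarianTokenizer.from_pretrained("./output_large")

print(tokenizer.eos_token, tokenizer.unk_token, tokenizer.pad_token)     # </s> <unk> <pad>
print(tokenizer.model_max_length)                                        # 512
print(tokenizer("דוגמה קצרה").input_ids[-1] == tokenizer.eos_token_id)   # True: EOS appended
```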
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "epoch": 10.0,
+   "train_loss": 1.5652431253911159,
+   "train_runtime": 1147827.3858,
+   "train_samples": 9051022,
+   "train_samples_per_second": 78.854,
+   "train_steps_per_second": 4.928
+ }
trainer_state.json ADDED
@@ -0,0 +1,3515 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 10.0,
+   "global_step": 5656890,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
10
+ {
11
+ "epoch": 0.02,
12
+ "learning_rate": 4.991162104972874e-05,
13
+ "loss": 5.8101,
14
+ "step": 10000
15
+ },
16
+ {
17
+ "epoch": 0.04,
18
+ "learning_rate": 4.982326861579419e-05,
19
+ "loss": 4.147,
20
+ "step": 20000
21
+ },
22
+ {
23
+ "epoch": 0.05,
24
+ "learning_rate": 4.973490734308074e-05,
25
+ "loss": 3.446,
26
+ "step": 30000
27
+ },
28
+ {
29
+ "epoch": 0.07,
30
+ "learning_rate": 4.964655490914619e-05,
31
+ "loss": 3.0968,
32
+ "step": 40000
33
+ },
34
+ {
35
+ "epoch": 0.09,
36
+ "learning_rate": 4.9558193636432744e-05,
37
+ "loss": 2.8867,
38
+ "step": 50000
39
+ },
40
+ {
41
+ "epoch": 0.11,
42
+ "learning_rate": 4.94698500412771e-05,
43
+ "loss": 2.7374,
44
+ "step": 60000
45
+ },
46
+ {
47
+ "epoch": 0.12,
48
+ "learning_rate": 4.938149760734255e-05,
49
+ "loss": 2.6213,
50
+ "step": 70000
51
+ },
52
+ {
53
+ "epoch": 0.14,
54
+ "learning_rate": 4.92931363346291e-05,
55
+ "loss": 2.5359,
56
+ "step": 80000
57
+ },
58
+ {
59
+ "epoch": 0.16,
60
+ "learning_rate": 4.920478390069455e-05,
61
+ "loss": 2.4665,
62
+ "step": 90000
63
+ },
64
+ {
65
+ "epoch": 0.18,
66
+ "learning_rate": 4.91164226279811e-05,
67
+ "loss": 2.4067,
68
+ "step": 100000
69
+ },
70
+ {
71
+ "epoch": 0.19,
72
+ "learning_rate": 4.902807019404656e-05,
73
+ "loss": 2.3518,
74
+ "step": 110000
75
+ },
76
+ {
77
+ "epoch": 0.21,
78
+ "learning_rate": 4.893971776011201e-05,
79
+ "loss": 2.311,
80
+ "step": 120000
81
+ },
82
+ {
83
+ "epoch": 0.23,
84
+ "learning_rate": 4.885134764861965e-05,
85
+ "loss": 2.2728,
86
+ "step": 130000
87
+ },
88
+ {
89
+ "epoch": 0.25,
90
+ "learning_rate": 4.87629952146851e-05,
91
+ "loss": 2.2382,
92
+ "step": 140000
93
+ },
94
+ {
95
+ "epoch": 0.27,
96
+ "learning_rate": 4.867464278075056e-05,
97
+ "loss": 2.2096,
98
+ "step": 150000
99
+ },
100
+ {
101
+ "epoch": 0.28,
102
+ "learning_rate": 4.858629034681601e-05,
103
+ "loss": 2.178,
104
+ "step": 160000
105
+ },
106
+ {
107
+ "epoch": 0.3,
108
+ "learning_rate": 4.849793791288146e-05,
109
+ "loss": 2.1548,
110
+ "step": 170000
111
+ },
112
+ {
113
+ "epoch": 0.32,
114
+ "learning_rate": 4.840958547894692e-05,
115
+ "loss": 2.1315,
116
+ "step": 180000
117
+ },
118
+ {
119
+ "epoch": 0.34,
120
+ "learning_rate": 4.8321233045012365e-05,
121
+ "loss": 2.1091,
122
+ "step": 190000
123
+ },
124
+ {
125
+ "epoch": 0.35,
126
+ "learning_rate": 4.823288061107782e-05,
127
+ "loss": 2.0882,
128
+ "step": 200000
129
+ },
130
+ {
131
+ "epoch": 0.37,
132
+ "learning_rate": 4.814452817714327e-05,
133
+ "loss": 2.0706,
134
+ "step": 210000
135
+ },
136
+ {
137
+ "epoch": 0.39,
138
+ "learning_rate": 4.805618458198763e-05,
139
+ "loss": 2.0587,
140
+ "step": 220000
141
+ },
142
+ {
143
+ "epoch": 0.41,
144
+ "learning_rate": 4.7967832148053085e-05,
145
+ "loss": 2.0406,
146
+ "step": 230000
147
+ },
148
+ {
149
+ "epoch": 0.42,
150
+ "learning_rate": 4.787947971411853e-05,
151
+ "loss": 2.0289,
152
+ "step": 240000
153
+ },
154
+ {
155
+ "epoch": 0.44,
156
+ "learning_rate": 4.779112728018399e-05,
157
+ "loss": 2.013,
158
+ "step": 250000
159
+ },
160
+ {
161
+ "epoch": 0.46,
162
+ "learning_rate": 4.770276600747054e-05,
163
+ "loss": 1.9989,
164
+ "step": 260000
165
+ },
166
+ {
167
+ "epoch": 0.48,
168
+ "learning_rate": 4.7614413573535995e-05,
169
+ "loss": 1.9863,
170
+ "step": 270000
171
+ },
172
+ {
173
+ "epoch": 0.49,
174
+ "learning_rate": 4.752606997838035e-05,
175
+ "loss": 1.9783,
176
+ "step": 280000
177
+ },
178
+ {
179
+ "epoch": 0.51,
180
+ "learning_rate": 4.7437717544445804e-05,
181
+ "loss": 1.9657,
182
+ "step": 290000
183
+ },
184
+ {
185
+ "epoch": 0.53,
186
+ "learning_rate": 4.734935627173235e-05,
187
+ "loss": 1.9566,
188
+ "step": 300000
189
+ },
190
+ {
191
+ "epoch": 0.55,
192
+ "learning_rate": 4.72610038377978e-05,
193
+ "loss": 1.9453,
194
+ "step": 310000
195
+ },
196
+ {
197
+ "epoch": 0.57,
198
+ "learning_rate": 4.7172660242642156e-05,
199
+ "loss": 1.9367,
200
+ "step": 320000
201
+ },
202
+ {
203
+ "epoch": 0.58,
204
+ "learning_rate": 4.708430780870761e-05,
205
+ "loss": 1.9306,
206
+ "step": 330000
207
+ },
208
+ {
209
+ "epoch": 0.6,
210
+ "learning_rate": 4.699595537477307e-05,
211
+ "loss": 1.9191,
212
+ "step": 340000
213
+ },
214
+ {
215
+ "epoch": 0.62,
216
+ "learning_rate": 4.690761177961742e-05,
217
+ "loss": 1.9128,
218
+ "step": 350000
219
+ },
220
+ {
221
+ "epoch": 0.64,
222
+ "learning_rate": 4.6819259345682876e-05,
223
+ "loss": 1.9027,
224
+ "step": 360000
225
+ },
226
+ {
227
+ "epoch": 0.65,
228
+ "learning_rate": 4.673090691174833e-05,
229
+ "loss": 1.8961,
230
+ "step": 370000
231
+ },
232
+ {
233
+ "epoch": 0.67,
234
+ "learning_rate": 4.664254563903488e-05,
235
+ "loss": 1.8879,
236
+ "step": 380000
237
+ },
238
+ {
239
+ "epoch": 0.69,
240
+ "learning_rate": 4.6554202043879235e-05,
241
+ "loss": 1.8775,
242
+ "step": 390000
243
+ },
244
+ {
245
+ "epoch": 0.71,
246
+ "learning_rate": 4.646584960994469e-05,
247
+ "loss": 1.8793,
248
+ "step": 400000
249
+ },
250
+ {
251
+ "epoch": 0.72,
252
+ "learning_rate": 4.6377506014789044e-05,
253
+ "loss": 1.8719,
254
+ "step": 410000
255
+ },
256
+ {
257
+ "epoch": 0.74,
258
+ "learning_rate": 4.6289171258412315e-05,
259
+ "loss": 1.8633,
260
+ "step": 420000
261
+ },
262
+ {
263
+ "epoch": 0.76,
264
+ "learning_rate": 4.6200818824477763e-05,
265
+ "loss": 1.8573,
266
+ "step": 430000
267
+ },
268
+ {
269
+ "epoch": 0.78,
270
+ "learning_rate": 4.611245755176431e-05,
271
+ "loss": 1.8464,
272
+ "step": 440000
273
+ },
274
+ {
275
+ "epoch": 0.8,
276
+ "learning_rate": 4.602410511782976e-05,
277
+ "loss": 1.8467,
278
+ "step": 450000
279
+ },
280
+ {
281
+ "epoch": 0.81,
282
+ "learning_rate": 4.593575268389522e-05,
283
+ "loss": 1.8403,
284
+ "step": 460000
285
+ },
286
+ {
287
+ "epoch": 0.83,
288
+ "learning_rate": 4.584740024996067e-05,
289
+ "loss": 1.836,
290
+ "step": 470000
291
+ },
292
+ {
293
+ "epoch": 0.85,
294
+ "learning_rate": 4.575906549358393e-05,
295
+ "loss": 1.8332,
296
+ "step": 480000
297
+ },
298
+ {
299
+ "epoch": 0.87,
300
+ "learning_rate": 4.567071305964939e-05,
301
+ "loss": 1.8271,
302
+ "step": 490000
303
+ },
304
+ {
305
+ "epoch": 0.88,
306
+ "learning_rate": 4.5582360625714835e-05,
307
+ "loss": 1.8172,
308
+ "step": 500000
309
+ },
310
+ {
311
+ "epoch": 0.9,
312
+ "learning_rate": 4.5494017030559195e-05,
313
+ "loss": 1.8169,
314
+ "step": 510000
315
+ },
316
+ {
317
+ "epoch": 0.92,
318
+ "learning_rate": 4.540566459662465e-05,
319
+ "loss": 1.8168,
320
+ "step": 520000
321
+ },
322
+ {
323
+ "epoch": 0.94,
324
+ "learning_rate": 4.53173121626901e-05,
325
+ "loss": 1.8088,
326
+ "step": 530000
327
+ },
328
+ {
329
+ "epoch": 0.95,
330
+ "learning_rate": 4.522896856753446e-05,
331
+ "loss": 1.801,
332
+ "step": 540000
333
+ },
334
+ {
335
+ "epoch": 0.97,
336
+ "learning_rate": 4.514062497237882e-05,
337
+ "loss": 1.8026,
338
+ "step": 550000
339
+ },
340
+ {
341
+ "epoch": 0.99,
342
+ "learning_rate": 4.5052272538444275e-05,
343
+ "loss": 1.7986,
344
+ "step": 560000
345
+ },
346
+ {
347
+ "epoch": 1.0,
348
+ "eval_bleu": 28.7509,
349
+ "eval_gen_len": 66.1132,
350
+ "eval_loss": 1.8565547466278076,
351
+ "eval_runtime": 2948.752,
352
+ "eval_samples_per_second": 5.821,
353
+ "eval_steps_per_second": 0.364,
354
+ "step": 565689
355
+ },
356
+ {
357
+ "epoch": 1.01,
358
+ "learning_rate": 4.496392894328863e-05,
359
+ "loss": 1.7781,
360
+ "step": 570000
361
+ },
362
+ {
363
+ "epoch": 1.03,
364
+ "learning_rate": 4.487557650935408e-05,
365
+ "loss": 1.7622,
366
+ "step": 580000
367
+ },
368
+ {
369
+ "epoch": 1.04,
370
+ "learning_rate": 4.478722407541954e-05,
371
+ "loss": 1.7623,
372
+ "step": 590000
373
+ },
374
+ {
375
+ "epoch": 1.06,
376
+ "learning_rate": 4.469887164148499e-05,
377
+ "loss": 1.7613,
378
+ "step": 600000
379
+ },
380
+ {
381
+ "epoch": 1.08,
382
+ "learning_rate": 4.461052804632935e-05,
383
+ "loss": 1.7595,
384
+ "step": 610000
385
+ },
386
+ {
387
+ "epoch": 1.1,
388
+ "learning_rate": 4.45221756123948e-05,
389
+ "loss": 1.7563,
390
+ "step": 620000
391
+ },
392
+ {
393
+ "epoch": 1.11,
394
+ "learning_rate": 4.443382317846025e-05,
395
+ "loss": 1.7571,
396
+ "step": 630000
397
+ },
398
+ {
399
+ "epoch": 1.13,
400
+ "learning_rate": 4.434547958330461e-05,
401
+ "loss": 1.7521,
402
+ "step": 640000
403
+ },
404
+ {
405
+ "epoch": 1.15,
406
+ "learning_rate": 4.4257127149370066e-05,
407
+ "loss": 1.7513,
408
+ "step": 650000
409
+ },
410
+ {
411
+ "epoch": 1.17,
412
+ "learning_rate": 4.4168774715435515e-05,
413
+ "loss": 1.7471,
414
+ "step": 660000
415
+ },
416
+ {
417
+ "epoch": 1.18,
418
+ "learning_rate": 4.4080422281500964e-05,
419
+ "loss": 1.7532,
420
+ "step": 670000
421
+ },
422
+ {
423
+ "epoch": 1.2,
424
+ "learning_rate": 4.399207868634533e-05,
425
+ "loss": 1.7469,
426
+ "step": 680000
427
+ },
428
+ {
429
+ "epoch": 1.22,
430
+ "learning_rate": 4.390372625241078e-05,
431
+ "loss": 1.7447,
432
+ "step": 690000
433
+ },
434
+ {
435
+ "epoch": 1.24,
436
+ "learning_rate": 4.381537381847623e-05,
437
+ "loss": 1.7385,
438
+ "step": 700000
439
+ },
440
+ {
441
+ "epoch": 1.26,
442
+ "learning_rate": 4.372702138454168e-05,
443
+ "loss": 1.7426,
444
+ "step": 710000
445
+ },
446
+ {
447
+ "epoch": 1.27,
448
+ "learning_rate": 4.363867778938604e-05,
449
+ "loss": 1.7381,
450
+ "step": 720000
451
+ },
452
+ {
453
+ "epoch": 1.29,
454
+ "learning_rate": 4.355032535545149e-05,
455
+ "loss": 1.7338,
456
+ "step": 730000
457
+ },
458
+ {
459
+ "epoch": 1.31,
460
+ "learning_rate": 4.346197292151695e-05,
461
+ "loss": 1.7307,
462
+ "step": 740000
463
+ },
464
+ {
465
+ "epoch": 1.33,
466
+ "learning_rate": 4.33736204875824e-05,
467
+ "loss": 1.7319,
468
+ "step": 750000
469
+ },
470
+ {
471
+ "epoch": 1.34,
472
+ "learning_rate": 4.328525921486895e-05,
473
+ "loss": 1.7279,
474
+ "step": 760000
475
+ },
476
+ {
477
+ "epoch": 1.36,
478
+ "learning_rate": 4.3196915619713307e-05,
479
+ "loss": 1.729,
480
+ "step": 770000
481
+ },
482
+ {
483
+ "epoch": 1.38,
484
+ "learning_rate": 4.310856318577876e-05,
485
+ "loss": 1.7254,
486
+ "step": 780000
487
+ },
488
+ {
489
+ "epoch": 1.4,
490
+ "learning_rate": 4.3020201913065306e-05,
491
+ "loss": 1.7207,
492
+ "step": 790000
493
+ },
494
+ {
495
+ "epoch": 1.41,
496
+ "learning_rate": 4.293184947913076e-05,
497
+ "loss": 1.7183,
498
+ "step": 800000
499
+ },
500
+ {
501
+ "epoch": 1.43,
502
+ "learning_rate": 4.284350588397512e-05,
503
+ "loss": 1.7184,
504
+ "step": 810000
505
+ },
506
+ {
507
+ "epoch": 1.45,
508
+ "learning_rate": 4.275515345004057e-05,
509
+ "loss": 1.7139,
510
+ "step": 820000
511
+ },
512
+ {
513
+ "epoch": 1.47,
514
+ "learning_rate": 4.266679217732712e-05,
515
+ "loss": 1.7155,
516
+ "step": 830000
517
+ },
518
+ {
519
+ "epoch": 1.48,
520
+ "learning_rate": 4.257843090461367e-05,
521
+ "loss": 1.7122,
522
+ "step": 840000
523
+ },
524
+ {
525
+ "epoch": 1.5,
526
+ "learning_rate": 4.2490087309458025e-05,
527
+ "loss": 1.7111,
528
+ "step": 850000
529
+ },
530
+ {
531
+ "epoch": 1.52,
532
+ "learning_rate": 4.2401717197965665e-05,
533
+ "loss": 1.7106,
534
+ "step": 860000
535
+ },
536
+ {
537
+ "epoch": 1.54,
538
+ "learning_rate": 4.2313373602810025e-05,
539
+ "loss": 1.7071,
540
+ "step": 870000
541
+ },
542
+ {
543
+ "epoch": 1.56,
544
+ "learning_rate": 4.222502116887548e-05,
545
+ "loss": 1.7081,
546
+ "step": 880000
547
+ },
548
+ {
549
+ "epoch": 1.57,
550
+ "learning_rate": 4.213666873494093e-05,
551
+ "loss": 1.7053,
552
+ "step": 890000
553
+ },
554
+ {
555
+ "epoch": 1.59,
556
+ "learning_rate": 4.2048316301006384e-05,
557
+ "loss": 1.7023,
558
+ "step": 900000
559
+ },
560
+ {
561
+ "epoch": 1.61,
562
+ "learning_rate": 4.195996386707184e-05,
563
+ "loss": 1.7017,
564
+ "step": 910000
565
+ },
566
+ {
567
+ "epoch": 1.63,
568
+ "learning_rate": 4.1871611433137295e-05,
569
+ "loss": 1.6979,
570
+ "step": 920000
571
+ },
572
+ {
573
+ "epoch": 1.64,
574
+ "learning_rate": 4.178325016042384e-05,
575
+ "loss": 1.6953,
576
+ "step": 930000
577
+ },
578
+ {
579
+ "epoch": 1.66,
580
+ "learning_rate": 4.169489772648929e-05,
581
+ "loss": 1.693,
582
+ "step": 940000
583
+ },
584
+ {
585
+ "epoch": 1.68,
586
+ "learning_rate": 4.160653645377584e-05,
587
+ "loss": 1.6934,
588
+ "step": 950000
589
+ },
590
+ {
591
+ "epoch": 1.7,
592
+ "learning_rate": 4.1518184019841294e-05,
593
+ "loss": 1.6899,
594
+ "step": 960000
595
+ },
596
+ {
597
+ "epoch": 1.71,
598
+ "learning_rate": 4.142983158590675e-05,
599
+ "loss": 1.6913,
600
+ "step": 970000
601
+ },
602
+ {
603
+ "epoch": 1.73,
604
+ "learning_rate": 4.1341470313193294e-05,
605
+ "loss": 1.691,
606
+ "step": 980000
607
+ },
608
+ {
609
+ "epoch": 1.75,
610
+ "learning_rate": 4.125311787925875e-05,
611
+ "loss": 1.6888,
612
+ "step": 990000
613
+ },
614
+ {
615
+ "epoch": 1.77,
616
+ "learning_rate": 4.11647742841031e-05,
617
+ "loss": 1.6888,
618
+ "step": 1000000
619
+ },
620
+ {
621
+ "epoch": 1.79,
622
+ "learning_rate": 4.107641301138965e-05,
623
+ "loss": 1.6857,
624
+ "step": 1010000
625
+ },
626
+ {
627
+ "epoch": 1.8,
628
+ "learning_rate": 4.098806057745511e-05,
629
+ "loss": 1.6854,
630
+ "step": 1020000
631
+ },
632
+ {
633
+ "epoch": 1.82,
634
+ "learning_rate": 4.089970814352056e-05,
635
+ "loss": 1.679,
636
+ "step": 1030000
637
+ },
638
+ {
639
+ "epoch": 1.84,
640
+ "learning_rate": 4.08113468708071e-05,
641
+ "loss": 1.68,
642
+ "step": 1040000
643
+ },
644
+ {
645
+ "epoch": 1.86,
646
+ "learning_rate": 4.072299443687256e-05,
647
+ "loss": 1.681,
648
+ "step": 1050000
649
+ },
650
+ {
651
+ "epoch": 1.87,
652
+ "learning_rate": 4.063464200293801e-05,
653
+ "loss": 1.6783,
654
+ "step": 1060000
655
+ },
656
+ {
657
+ "epoch": 1.89,
658
+ "learning_rate": 4.0546298407782365e-05,
659
+ "loss": 1.6766,
660
+ "step": 1070000
661
+ },
662
+ {
663
+ "epoch": 1.91,
664
+ "learning_rate": 4.0457937135068916e-05,
665
+ "loss": 1.6762,
666
+ "step": 1080000
667
+ },
668
+ {
669
+ "epoch": 1.93,
670
+ "learning_rate": 4.036957586235546e-05,
671
+ "loss": 1.6753,
672
+ "step": 1090000
673
+ },
674
+ {
675
+ "epoch": 1.94,
676
+ "learning_rate": 4.0281223428420916e-05,
677
+ "loss": 1.671,
678
+ "step": 1100000
679
+ },
680
+ {
681
+ "epoch": 1.96,
682
+ "learning_rate": 4.019287099448637e-05,
683
+ "loss": 1.668,
684
+ "step": 1110000
685
+ },
686
+ {
687
+ "epoch": 1.98,
688
+ "learning_rate": 4.0104527399330724e-05,
689
+ "loss": 1.6676,
690
+ "step": 1120000
691
+ },
692
+ {
693
+ "epoch": 2.0,
694
+ "learning_rate": 4.0016166126617275e-05,
695
+ "loss": 1.6695,
696
+ "step": 1130000
697
+ },
698
+ {
699
+ "epoch": 2.0,
700
+ "eval_bleu": 29.525,
701
+ "eval_gen_len": 66.2651,
702
+ "eval_loss": 1.7653018236160278,
703
+ "eval_runtime": 3037.2133,
704
+ "eval_samples_per_second": 5.652,
705
+ "eval_steps_per_second": 0.353,
706
+ "step": 1131378
707
+ },
708
+ {
709
+ "epoch": 2.02,
710
+ "learning_rate": 3.992781369268273e-05,
711
+ "loss": 1.6357,
712
+ "step": 1140000
713
+ },
714
+ {
715
+ "epoch": 2.03,
716
+ "learning_rate": 3.9839461258748186e-05,
717
+ "loss": 1.6309,
718
+ "step": 1150000
719
+ },
720
+ {
721
+ "epoch": 2.05,
722
+ "learning_rate": 3.975109998603473e-05,
723
+ "loss": 1.6355,
724
+ "step": 1160000
725
+ },
726
+ {
727
+ "epoch": 2.07,
728
+ "learning_rate": 3.966275639087909e-05,
729
+ "loss": 1.6321,
730
+ "step": 1170000
731
+ },
732
+ {
733
+ "epoch": 2.09,
734
+ "learning_rate": 3.9574395118165634e-05,
735
+ "loss": 1.6344,
736
+ "step": 1180000
737
+ },
738
+ {
739
+ "epoch": 2.1,
740
+ "learning_rate": 3.948604268423109e-05,
741
+ "loss": 1.6312,
742
+ "step": 1190000
743
+ },
744
+ {
745
+ "epoch": 2.12,
746
+ "learning_rate": 3.939768141151764e-05,
747
+ "loss": 1.6324,
748
+ "step": 1200000
749
+ },
750
+ {
751
+ "epoch": 2.14,
752
+ "learning_rate": 3.9309328977583096e-05,
753
+ "loss": 1.6357,
754
+ "step": 1210000
755
+ },
756
+ {
757
+ "epoch": 2.16,
758
+ "learning_rate": 3.922097654364854e-05,
759
+ "loss": 1.6325,
760
+ "step": 1220000
761
+ },
762
+ {
763
+ "epoch": 2.17,
764
+ "learning_rate": 3.913262410971399e-05,
765
+ "loss": 1.6327,
766
+ "step": 1230000
767
+ },
768
+ {
769
+ "epoch": 2.19,
770
+ "learning_rate": 3.9044262837000544e-05,
771
+ "loss": 1.632,
772
+ "step": 1240000
773
+ },
774
+ {
775
+ "epoch": 2.21,
776
+ "learning_rate": 3.8955910403066e-05,
777
+ "loss": 1.6315,
778
+ "step": 1250000
779
+ },
780
+ {
781
+ "epoch": 2.23,
782
+ "learning_rate": 3.8867557969131455e-05,
783
+ "loss": 1.6322,
784
+ "step": 1260000
785
+ },
786
+ {
787
+ "epoch": 2.25,
788
+ "learning_rate": 3.877921437397581e-05,
789
+ "loss": 1.6314,
790
+ "step": 1270000
791
+ },
792
+ {
793
+ "epoch": 2.26,
794
+ "learning_rate": 3.8690861940041264e-05,
795
+ "loss": 1.6317,
796
+ "step": 1280000
797
+ },
798
+ {
799
+ "epoch": 2.28,
800
+ "learning_rate": 3.860250066732781e-05,
801
+ "loss": 1.6275,
802
+ "step": 1290000
803
+ },
804
+ {
805
+ "epoch": 2.3,
806
+ "learning_rate": 3.851415707217217e-05,
807
+ "loss": 1.6296,
808
+ "step": 1300000
809
+ },
810
+ {
811
+ "epoch": 2.32,
812
+ "learning_rate": 3.842579579945871e-05,
813
+ "loss": 1.6305,
814
+ "step": 1310000
815
+ },
816
+ {
817
+ "epoch": 2.33,
818
+ "learning_rate": 3.833743452674526e-05,
819
+ "loss": 1.6266,
820
+ "step": 1320000
821
+ },
822
+ {
823
+ "epoch": 2.35,
824
+ "learning_rate": 3.824908209281072e-05,
825
+ "loss": 1.6273,
826
+ "step": 1330000
827
+ },
828
+ {
829
+ "epoch": 2.37,
830
+ "learning_rate": 3.8160729658876174e-05,
831
+ "loss": 1.6283,
832
+ "step": 1340000
833
+ },
834
+ {
835
+ "epoch": 2.39,
836
+ "learning_rate": 3.807237722494162e-05,
837
+ "loss": 1.629,
838
+ "step": 1350000
839
+ },
840
+ {
841
+ "epoch": 2.4,
842
+ "learning_rate": 3.798403362978598e-05,
843
+ "loss": 1.6269,
844
+ "step": 1360000
845
+ },
846
+ {
847
+ "epoch": 2.42,
848
+ "learning_rate": 3.7895672357072526e-05,
849
+ "loss": 1.6252,
850
+ "step": 1370000
851
+ },
852
+ {
853
+ "epoch": 2.44,
854
+ "learning_rate": 3.7807328761916886e-05,
855
+ "loss": 1.6221,
856
+ "step": 1380000
857
+ },
858
+ {
859
+ "epoch": 2.46,
860
+ "learning_rate": 3.7718976327982335e-05,
861
+ "loss": 1.6229,
862
+ "step": 1390000
863
+ },
864
+ {
865
+ "epoch": 2.47,
866
+ "learning_rate": 3.7630615055268886e-05,
867
+ "loss": 1.6226,
868
+ "step": 1400000
869
+ },
870
+ {
871
+ "epoch": 2.49,
872
+ "learning_rate": 3.754226262133434e-05,
873
+ "loss": 1.6224,
874
+ "step": 1410000
875
+ },
876
+ {
877
+ "epoch": 2.51,
878
+ "learning_rate": 3.74539101873998e-05,
879
+ "loss": 1.6207,
880
+ "step": 1420000
881
+ },
882
+ {
883
+ "epoch": 2.53,
884
+ "learning_rate": 3.7365557753465245e-05,
885
+ "loss": 1.6204,
886
+ "step": 1430000
887
+ },
888
+ {
889
+ "epoch": 2.55,
890
+ "learning_rate": 3.72772053195307e-05,
891
+ "loss": 1.6183,
892
+ "step": 1440000
893
+ },
894
+ {
895
+ "epoch": 2.56,
896
+ "learning_rate": 3.7188861724375054e-05,
897
+ "loss": 1.6235,
898
+ "step": 1450000
899
+ },
900
+ {
901
+ "epoch": 2.58,
902
+ "learning_rate": 3.710050929044051e-05,
903
+ "loss": 1.6207,
904
+ "step": 1460000
905
+ },
906
+ {
907
+ "epoch": 2.6,
908
+ "learning_rate": 3.7012156856505965e-05,
909
+ "loss": 1.618,
910
+ "step": 1470000
911
+ },
912
+ {
913
+ "epoch": 2.62,
914
+ "learning_rate": 3.692381326135032e-05,
915
+ "loss": 1.6162,
916
+ "step": 1480000
917
+ },
918
+ {
919
+ "epoch": 2.63,
920
+ "learning_rate": 3.683546082741577e-05,
921
+ "loss": 1.6194,
922
+ "step": 1490000
923
+ },
924
+ {
925
+ "epoch": 2.65,
926
+ "learning_rate": 3.674711723226013e-05,
927
+ "loss": 1.6154,
928
+ "step": 1500000
929
+ },
930
+ {
931
+ "epoch": 2.67,
932
+ "learning_rate": 3.6658755959546684e-05,
933
+ "loss": 1.6156,
934
+ "step": 1510000
935
+ },
936
+ {
937
+ "epoch": 2.69,
938
+ "learning_rate": 3.6570403525612126e-05,
939
+ "loss": 1.6144,
940
+ "step": 1520000
941
+ },
942
+ {
943
+ "epoch": 2.7,
944
+ "learning_rate": 3.648205993045649e-05,
945
+ "loss": 1.6125,
946
+ "step": 1530000
947
+ },
948
+ {
949
+ "epoch": 2.72,
950
+ "learning_rate": 3.639369865774304e-05,
951
+ "loss": 1.6134,
952
+ "step": 1540000
953
+ },
954
+ {
955
+ "epoch": 2.74,
956
+ "learning_rate": 3.630534622380849e-05,
957
+ "loss": 1.6102,
958
+ "step": 1550000
959
+ },
960
+ {
961
+ "epoch": 2.76,
962
+ "learning_rate": 3.6217002628652845e-05,
963
+ "loss": 1.6065,
964
+ "step": 1560000
965
+ },
966
+ {
967
+ "epoch": 2.78,
968
+ "learning_rate": 3.6128641355939396e-05,
969
+ "loss": 1.6073,
970
+ "step": 1570000
971
+ },
972
+ {
973
+ "epoch": 2.79,
974
+ "learning_rate": 3.604028008322594e-05,
975
+ "loss": 1.6071,
976
+ "step": 1580000
977
+ },
978
+ {
979
+ "epoch": 2.81,
980
+ "learning_rate": 3.59519364880703e-05,
981
+ "loss": 1.6046,
982
+ "step": 1590000
983
+ },
984
+ {
985
+ "epoch": 2.83,
986
+ "learning_rate": 3.586359289291466e-05,
987
+ "loss": 1.607,
988
+ "step": 1600000
989
+ },
990
+ {
991
+ "epoch": 2.85,
992
+ "learning_rate": 3.5775231620201204e-05,
993
+ "loss": 1.6069,
994
+ "step": 1610000
995
+ },
996
+ {
997
+ "epoch": 2.86,
998
+ "learning_rate": 3.5686888025045564e-05,
999
+ "loss": 1.6058,
1000
+ "step": 1620000
1001
+ },
1002
+ {
1003
+ "epoch": 2.88,
1004
+ "learning_rate": 3.5598526752332115e-05,
1005
+ "loss": 1.6031,
1006
+ "step": 1630000
1007
+ },
1008
+ {
1009
+ "epoch": 2.9,
1010
+ "learning_rate": 3.551018315717647e-05,
1011
+ "loss": 1.6047,
1012
+ "step": 1640000
1013
+ },
1014
+ {
1015
+ "epoch": 2.92,
1016
+ "learning_rate": 3.542182188446302e-05,
1017
+ "loss": 1.6042,
1018
+ "step": 1650000
1019
+ },
1020
+ {
1021
+ "epoch": 2.93,
1022
+ "learning_rate": 3.5333469450528475e-05,
1023
+ "loss": 1.6025,
1024
+ "step": 1660000
1025
+ },
1026
+ {
1027
+ "epoch": 2.95,
1028
+ "learning_rate": 3.524511701659392e-05,
1029
+ "loss": 1.6036,
1030
+ "step": 1670000
1031
+ },
1032
+ {
1033
+ "epoch": 2.97,
1034
+ "learning_rate": 3.515677342143828e-05,
1035
+ "loss": 1.6011,
1036
+ "step": 1680000
1037
+ },
1038
+ {
1039
+ "epoch": 2.99,
1040
+ "learning_rate": 3.506842098750374e-05,
1041
+ "loss": 1.6038,
1042
+ "step": 1690000
1043
+ },
1044
+ {
1045
+ "epoch": 3.0,
1046
+ "eval_bleu": 29.8841,
1047
+ "eval_gen_len": 66.1849,
1048
+ "eval_loss": 1.7081401348114014,
1049
+ "eval_runtime": 2996.6763,
1050
+ "eval_samples_per_second": 5.728,
1051
+ "eval_steps_per_second": 0.358,
1052
+ "step": 1697067
1053
+ },
1054
+ {
1055
+ "epoch": 3.01,
1056
+ "learning_rate": 3.498005971479029e-05,
1057
+ "loss": 1.5881,
1058
+ "step": 1700000
1059
+ },
1060
+ {
1061
+ "epoch": 3.02,
1062
+ "learning_rate": 3.489170728085573e-05,
1063
+ "loss": 1.5596,
1064
+ "step": 1710000
1065
+ },
1066
+ {
1067
+ "epoch": 3.04,
1068
+ "learning_rate": 3.48033636857001e-05,
1069
+ "loss": 1.5643,
1070
+ "step": 1720000
1071
+ },
1072
+ {
1073
+ "epoch": 3.06,
1074
+ "learning_rate": 3.471502009054445e-05,
1075
+ "loss": 1.563,
1076
+ "step": 1730000
1077
+ },
1078
+ {
1079
+ "epoch": 3.08,
1080
+ "learning_rate": 3.462667649538881e-05,
1081
+ "loss": 1.5633,
1082
+ "step": 1740000
1083
+ },
1084
+ {
1085
+ "epoch": 3.09,
1086
+ "learning_rate": 3.4538324061454266e-05,
1087
+ "loss": 1.5678,
1088
+ "step": 1750000
1089
+ },
1090
+ {
1091
+ "epoch": 3.11,
1092
+ "learning_rate": 3.444996278874081e-05,
1093
+ "loss": 1.5677,
1094
+ "step": 1760000
1095
+ },
1096
+ {
1097
+ "epoch": 3.13,
1098
+ "learning_rate": 3.436161919358517e-05,
1099
+ "loss": 1.5664,
1100
+ "step": 1770000
1101
+ },
1102
+ {
1103
+ "epoch": 3.15,
1104
+ "learning_rate": 3.427327559842953e-05,
1105
+ "loss": 1.57,
1106
+ "step": 1780000
1107
+ },
1108
+ {
1109
+ "epoch": 3.16,
1110
+ "learning_rate": 3.418492316449498e-05,
1111
+ "loss": 1.5695,
1112
+ "step": 1790000
1113
+ },
1114
+ {
1115
+ "epoch": 3.18,
1116
+ "learning_rate": 3.4096570730560434e-05,
1117
+ "loss": 1.5693,
1118
+ "step": 1800000
1119
+ },
1120
+ {
1121
+ "epoch": 3.2,
1122
+ "learning_rate": 3.4008227135404794e-05,
1123
+ "loss": 1.5669,
1124
+ "step": 1810000
1125
+ },
1126
+ {
1127
+ "epoch": 3.22,
1128
+ "learning_rate": 3.391987470147024e-05,
1129
+ "loss": 1.5677,
1130
+ "step": 1820000
1131
+ },
1132
+ {
1133
+ "epoch": 3.23,
1134
+ "learning_rate": 3.38315222675357e-05,
1135
+ "loss": 1.5689,
1136
+ "step": 1830000
1137
+ },
1138
+ {
1139
+ "epoch": 3.25,
1140
+ "learning_rate": 3.3743169833601154e-05,
1141
+ "loss": 1.5711,
1142
+ "step": 1840000
1143
+ },
1144
+ {
1145
+ "epoch": 3.27,
1146
+ "learning_rate": 3.36548173996666e-05,
1147
+ "loss": 1.5679,
1148
+ "step": 1850000
1149
+ },
1150
+ {
1151
+ "epoch": 3.29,
1152
+ "learning_rate": 3.3566456126953147e-05,
1153
+ "loss": 1.5705,
1154
+ "step": 1860000
1155
+ },
1156
+ {
1157
+ "epoch": 3.31,
1158
+ "learning_rate": 3.3478112531797506e-05,
1159
+ "loss": 1.5689,
1160
+ "step": 1870000
1161
+ },
1162
+ {
1163
+ "epoch": 3.32,
1164
+ "learning_rate": 3.338976009786296e-05,
1165
+ "loss": 1.5657,
1166
+ "step": 1880000
1167
+ },
1168
+ {
1169
+ "epoch": 3.34,
1170
+ "learning_rate": 3.330139882514951e-05,
1171
+ "loss": 1.5658,
1172
+ "step": 1890000
1173
+ },
1174
+ {
1175
+ "epoch": 3.36,
1176
+ "learning_rate": 3.3213055229993866e-05,
1177
+ "loss": 1.5675,
1178
+ "step": 1900000
1179
+ },
1180
+ {
1181
+ "epoch": 3.38,
1182
+ "learning_rate": 3.312469395728042e-05,
1183
+ "loss": 1.5664,
1184
+ "step": 1910000
1185
+ },
1186
+ {
1187
+ "epoch": 3.39,
1188
+ "learning_rate": 3.303635036212478e-05,
1189
+ "loss": 1.5668,
1190
+ "step": 1920000
1191
+ },
1192
+ {
1193
+ "epoch": 3.41,
1194
+ "learning_rate": 3.2947997928190225e-05,
1195
+ "loss": 1.5653,
1196
+ "step": 1930000
1197
+ },
1198
+ {
1199
+ "epoch": 3.43,
1200
+ "learning_rate": 3.285963665547677e-05,
1201
+ "loss": 1.5677,
1202
+ "step": 1940000
1203
+ },
1204
+ {
1205
+ "epoch": 3.45,
1206
+ "learning_rate": 3.2771293060321136e-05,
1207
+ "loss": 1.5633,
1208
+ "step": 1950000
1209
+ },
1210
+ {
1211
+ "epoch": 3.46,
1212
+ "learning_rate": 3.2682940626386585e-05,
1213
+ "loss": 1.5638,
1214
+ "step": 1960000
1215
+ },
1216
+ {
1217
+ "epoch": 3.48,
1218
+ "learning_rate": 3.2594579353673136e-05,
1219
+ "loss": 1.5635,
1220
+ "step": 1970000
1221
+ },
1222
+ {
1223
+ "epoch": 3.5,
1224
+ "learning_rate": 3.2506226919738584e-05,
1225
+ "loss": 1.566,
1226
+ "step": 1980000
1227
+ },
1228
+ {
1229
+ "epoch": 3.52,
1230
+ "learning_rate": 3.241787448580404e-05,
1231
+ "loss": 1.5641,
1232
+ "step": 1990000
1233
+ },
1234
+ {
1235
+ "epoch": 3.54,
1236
+ "learning_rate": 3.232950437431168e-05,
1237
+ "loss": 1.5634,
1238
+ "step": 2000000
1239
+ },
1240
+ {
1241
+ "epoch": 3.55,
1242
+ "learning_rate": 3.224116077915604e-05,
1243
+ "loss": 1.5658,
1244
+ "step": 2010000
1245
+ },
1246
+ {
1247
+ "epoch": 3.57,
1248
+ "learning_rate": 3.215280834522149e-05,
1249
+ "loss": 1.5644,
1250
+ "step": 2020000
1251
+ },
1252
+ {
1253
+ "epoch": 3.59,
1254
+ "learning_rate": 3.2064455911286943e-05,
1255
+ "loss": 1.5625,
1256
+ "step": 2030000
1257
+ },
1258
+ {
1259
+ "epoch": 3.61,
1260
+ "learning_rate": 3.19761034773524e-05,
1261
+ "loss": 1.562,
1262
+ "step": 2040000
1263
+ },
1264
+ {
1265
+ "epoch": 3.62,
1266
+ "learning_rate": 3.1887751043417854e-05,
1267
+ "loss": 1.5634,
1268
+ "step": 2050000
1269
+ },
1270
+ {
1271
+ "epoch": 3.64,
1272
+ "learning_rate": 3.179940744826221e-05,
1273
+ "loss": 1.5595,
1274
+ "step": 2060000
1275
+ },
1276
+ {
1277
+ "epoch": 3.66,
1278
+ "learning_rate": 3.171104617554876e-05,
1279
+ "loss": 1.5594,
1280
+ "step": 2070000
1281
+ },
1282
+ {
1283
+ "epoch": 3.68,
1284
+ "learning_rate": 3.1622693741614214e-05,
1285
+ "loss": 1.5609,
1286
+ "step": 2080000
1287
+ },
1288
+ {
1289
+ "epoch": 3.69,
1290
+ "learning_rate": 3.153434130767966e-05,
1291
+ "loss": 1.5606,
1292
+ "step": 2090000
1293
+ },
1294
+ {
1295
+ "epoch": 3.71,
1296
+ "learning_rate": 3.144598003496621e-05,
1297
+ "loss": 1.562,
1298
+ "step": 2100000
1299
+ },
1300
+ {
1301
+ "epoch": 3.73,
1302
+ "learning_rate": 3.135762760103166e-05,
1303
+ "loss": 1.5601,
1304
+ "step": 2110000
1305
+ },
1306
+ {
1307
+ "epoch": 3.75,
1308
+ "learning_rate": 3.126927516709712e-05,
1309
+ "loss": 1.5574,
1310
+ "step": 2120000
1311
+ },
1312
+ {
1313
+ "epoch": 3.77,
1314
+ "learning_rate": 3.118092273316257e-05,
1315
+ "loss": 1.557,
1316
+ "step": 2130000
1317
+ },
1318
+ {
1319
+ "epoch": 3.78,
1320
+ "learning_rate": 3.109257029922802e-05,
1321
+ "loss": 1.558,
1322
+ "step": 2140000
1323
+ },
1324
+ {
1325
+ "epoch": 3.8,
1326
+ "learning_rate": 3.100420902651457e-05,
1327
+ "loss": 1.5589,
1328
+ "step": 2150000
1329
+ },
1330
+ {
1331
+ "epoch": 3.82,
1332
+ "learning_rate": 3.0915865431358926e-05,
1333
+ "loss": 1.5563,
1334
+ "step": 2160000
1335
+ },
1336
+ {
1337
+ "epoch": 3.84,
1338
+ "learning_rate": 3.082751299742438e-05,
1339
+ "loss": 1.557,
1340
+ "step": 2170000
1341
+ },
1342
+ {
1343
+ "epoch": 3.85,
1344
+ "learning_rate": 3.073915172471093e-05,
1345
+ "loss": 1.5517,
1346
+ "step": 2180000
1347
+ },
1348
+ {
1349
+ "epoch": 3.87,
1350
+ "learning_rate": 3.0650808129555285e-05,
1351
+ "loss": 1.5569,
1352
+ "step": 2190000
1353
+ },
1354
+ {
1355
+ "epoch": 3.89,
1356
+ "learning_rate": 3.0562438018062925e-05,
1357
+ "loss": 1.5561,
1358
+ "step": 2200000
1359
+ },
1360
+ {
1361
+ "epoch": 3.91,
1362
+ "learning_rate": 3.047409442290729e-05,
1363
+ "loss": 1.5536,
1364
+ "step": 2210000
1365
+ },
1366
+ {
1367
+ "epoch": 3.92,
1368
+ "learning_rate": 3.0385733150193836e-05,
1369
+ "loss": 1.5567,
1370
+ "step": 2220000
1371
+ },
1372
+ {
1373
+ "epoch": 3.94,
1374
+ "learning_rate": 3.0297371877480386e-05,
1375
+ "loss": 1.5514,
1376
+ "step": 2230000
1377
+ },
1378
+ {
1379
+ "epoch": 3.96,
1380
+ "learning_rate": 3.0209028282324743e-05,
1381
+ "loss": 1.5542,
1382
+ "step": 2240000
1383
+ },
1384
+ {
1385
+ "epoch": 3.98,
1386
+ "learning_rate": 3.0120675848390195e-05,
1387
+ "loss": 1.5515,
1388
+ "step": 2250000
1389
+ },
1390
+ {
1391
+ "epoch": 4.0,
1392
+ "learning_rate": 3.0032323414455647e-05,
1393
+ "loss": 1.5515,
1394
+ "step": 2260000
1395
+ },
1396
+ {
1397
+ "epoch": 4.0,
1398
+ "eval_bleu": 30.588,
1399
+ "eval_gen_len": 65.9093,
1400
+ "eval_loss": 1.6601390838623047,
1401
+ "eval_runtime": 3100.6175,
1402
+ "eval_samples_per_second": 5.536,
1403
+ "eval_steps_per_second": 0.346,
1404
+ "step": 2262756
1405
+ },
1406
+ {
1407
+ "epoch": 4.01,
1408
+ "learning_rate": 2.9943970980521102e-05,
1409
+ "loss": 1.5183,
1410
+ "step": 2270000
1411
+ },
1412
+ {
1413
+ "epoch": 4.03,
1414
+ "learning_rate": 2.9855609707807647e-05,
1415
+ "loss": 1.5123,
1416
+ "step": 2280000
1417
+ },
1418
+ {
1419
+ "epoch": 4.05,
1420
+ "learning_rate": 2.9767257273873102e-05,
1421
+ "loss": 1.5146,
1422
+ "step": 2290000
1423
+ },
1424
+ {
1425
+ "epoch": 4.07,
1426
+ "learning_rate": 2.9678904839938554e-05,
1427
+ "loss": 1.5144,
1428
+ "step": 2300000
1429
+ },
1430
+ {
1431
+ "epoch": 4.08,
1432
+ "learning_rate": 2.9590552406004006e-05,
1433
+ "loss": 1.5173,
1434
+ "step": 2310000
1435
+ },
1436
+ {
1437
+ "epoch": 4.1,
1438
+ "learning_rate": 2.9502191133290557e-05,
1439
+ "loss": 1.5188,
1440
+ "step": 2320000
1441
+ },
1442
+ {
1443
+ "epoch": 4.12,
1444
+ "learning_rate": 2.9413838699356006e-05,
1445
+ "loss": 1.5195,
1446
+ "step": 2330000
1447
+ },
1448
+ {
1449
+ "epoch": 4.14,
1450
+ "learning_rate": 2.932549510420037e-05,
1451
+ "loss": 1.5181,
1452
+ "step": 2340000
1453
+ },
1454
+ {
1455
+ "epoch": 4.15,
1456
+ "learning_rate": 2.923714267026582e-05,
1457
+ "loss": 1.5199,
1458
+ "step": 2350000
1459
+ },
1460
+ {
1461
+ "epoch": 4.17,
1462
+ "learning_rate": 2.9148799075110177e-05,
1463
+ "loss": 1.5206,
1464
+ "step": 2360000
1465
+ },
1466
+ {
1467
+ "epoch": 4.19,
1468
+ "learning_rate": 2.9060446641175633e-05,
1469
+ "loss": 1.5207,
1470
+ "step": 2370000
1471
+ },
1472
+ {
1473
+ "epoch": 4.21,
1474
+ "learning_rate": 2.8972094207241085e-05,
1475
+ "loss": 1.5209,
1476
+ "step": 2380000
1477
+ },
1478
+ {
1479
+ "epoch": 4.22,
1480
+ "learning_rate": 2.8883741773306534e-05,
1481
+ "loss": 1.5192,
1482
+ "step": 2390000
1483
+ },
1484
+ {
1485
+ "epoch": 4.24,
1486
+ "learning_rate": 2.8795380500593084e-05,
1487
+ "loss": 1.5231,
1488
+ "step": 2400000
1489
+ },
1490
+ {
1491
+ "epoch": 4.26,
1492
+ "learning_rate": 2.870703690543744e-05,
1493
+ "loss": 1.5201,
1494
+ "step": 2410000
1495
+ },
1496
+ {
1497
+ "epoch": 4.28,
1498
+ "learning_rate": 2.8618684471502893e-05,
1499
+ "loss": 1.5204,
1500
+ "step": 2420000
1501
+ },
1502
+ {
1503
+ "epoch": 4.3,
1504
+ "learning_rate": 2.853033203756835e-05,
1505
+ "loss": 1.5224,
1506
+ "step": 2430000
1507
+ },
1508
+ {
1509
+ "epoch": 4.31,
1510
+ "learning_rate": 2.84419796036338e-05,
1511
+ "loss": 1.5221,
1512
+ "step": 2440000
1513
+ },
1514
+ {
1515
+ "epoch": 4.33,
1516
+ "learning_rate": 2.8353627169699252e-05,
1517
+ "loss": 1.52,
1518
+ "step": 2450000
1519
+ },
1520
+ {
1521
+ "epoch": 4.35,
1522
+ "learning_rate": 2.8265274735764708e-05,
1523
+ "loss": 1.5212,
1524
+ "step": 2460000
1525
+ },
1526
+ {
1527
+ "epoch": 4.37,
1528
+ "learning_rate": 2.8176931140609064e-05,
1529
+ "loss": 1.5241,
1530
+ "step": 2470000
1531
+ },
1532
+ {
1533
+ "epoch": 4.38,
1534
+ "learning_rate": 2.8088578706674516e-05,
1535
+ "loss": 1.5224,
1536
+ "step": 2480000
1537
+ },
1538
+ {
1539
+ "epoch": 4.4,
1540
+ "learning_rate": 2.8000235111518873e-05,
1541
+ "loss": 1.5185,
1542
+ "step": 2490000
1543
+ },
1544
+ {
1545
+ "epoch": 4.42,
1546
+ "learning_rate": 2.791188267758433e-05,
1547
+ "loss": 1.5184,
1548
+ "step": 2500000
1549
+ },
1550
+ {
1551
+ "epoch": 4.44,
1552
+ "learning_rate": 2.7823539082428685e-05,
1553
+ "loss": 1.5199,
1554
+ "step": 2510000
1555
+ },
1556
+ {
1557
+ "epoch": 4.45,
1558
+ "learning_rate": 2.7735186648494137e-05,
1559
+ "loss": 1.5198,
1560
+ "step": 2520000
1561
+ },
1562
+ {
1563
+ "epoch": 4.47,
1564
+ "learning_rate": 2.7646834214559592e-05,
1565
+ "loss": 1.5195,
1566
+ "step": 2530000
1567
+ },
1568
+ {
1569
+ "epoch": 4.49,
1570
+ "learning_rate": 2.7558481780625044e-05,
1571
+ "loss": 1.522,
1572
+ "step": 2540000
1573
+ },
1574
+ {
1575
+ "epoch": 4.51,
1576
+ "learning_rate": 2.74701293466905e-05,
1577
+ "loss": 1.5223,
1578
+ "step": 2550000
1579
+ },
1580
+ {
1581
+ "epoch": 4.53,
1582
+ "learning_rate": 2.7381776912755952e-05,
1583
+ "loss": 1.5183,
1584
+ "step": 2560000
1585
+ },
1586
+ {
1587
+ "epoch": 4.54,
1588
+ "learning_rate": 2.7293433317600308e-05,
1589
+ "loss": 1.5225,
1590
+ "step": 2570000
1591
+ },
1592
+ {
1593
+ "epoch": 4.56,
1594
+ "learning_rate": 2.7205080883665764e-05,
1595
+ "loss": 1.5198,
1596
+ "step": 2580000
1597
+ },
1598
+ {
1599
+ "epoch": 4.58,
1600
+ "learning_rate": 2.7116728449731216e-05,
1601
+ "loss": 1.5201,
1602
+ "step": 2590000
1603
+ },
1604
+ {
1605
+ "epoch": 4.6,
1606
+ "learning_rate": 2.7028384854575572e-05,
1607
+ "loss": 1.5195,
1608
+ "step": 2600000
1609
+ },
1610
+ {
1611
+ "epoch": 4.61,
1612
+ "learning_rate": 2.6940032420641028e-05,
1613
+ "loss": 1.52,
1614
+ "step": 2610000
1615
+ },
1616
+ {
1617
+ "epoch": 4.63,
1618
+ "learning_rate": 2.6851688825485384e-05,
1619
+ "loss": 1.5202,
1620
+ "step": 2620000
1621
+ },
1622
+ {
1623
+ "epoch": 4.65,
1624
+ "learning_rate": 2.6763336391550836e-05,
1625
+ "loss": 1.5192,
1626
+ "step": 2630000
1627
+ },
1628
+ {
1629
+ "epoch": 4.67,
1630
+ "learning_rate": 2.6674983957616288e-05,
1631
+ "loss": 1.5176,
1632
+ "step": 2640000
1633
+ },
1634
+ {
1635
+ "epoch": 4.68,
1636
+ "learning_rate": 2.6586631523681744e-05,
1637
+ "loss": 1.5153,
1638
+ "step": 2650000
1639
+ },
1640
+ {
1641
+ "epoch": 4.7,
1642
+ "learning_rate": 2.64982879285261e-05,
1643
+ "loss": 1.5156,
1644
+ "step": 2660000
1645
+ },
1646
+ {
1647
+ "epoch": 4.72,
1648
+ "learning_rate": 2.640992665581265e-05,
1649
+ "loss": 1.5191,
1650
+ "step": 2670000
1651
+ },
1652
+ {
1653
+ "epoch": 4.74,
1654
+ "learning_rate": 2.6321583060657008e-05,
1655
+ "loss": 1.5147,
1656
+ "step": 2680000
1657
+ },
1658
+ {
1659
+ "epoch": 4.76,
1660
+ "learning_rate": 2.6233239465501364e-05,
1661
+ "loss": 1.5169,
1662
+ "step": 2690000
1663
+ },
1664
+ {
1665
+ "epoch": 4.77,
1666
+ "learning_rate": 2.614489587034572e-05,
1667
+ "loss": 1.5174,
1668
+ "step": 2700000
1669
+ },
1670
+ {
1671
+ "epoch": 4.79,
1672
+ "learning_rate": 2.605653459763227e-05,
1673
+ "loss": 1.5164,
1674
+ "step": 2710000
1675
+ },
1676
+ {
1677
+ "epoch": 4.81,
1678
+ "learning_rate": 2.5968182163697724e-05,
1679
+ "loss": 1.5134,
1680
+ "step": 2720000
1681
+ },
1682
+ {
1683
+ "epoch": 4.83,
1684
+ "learning_rate": 2.587983856854208e-05,
1685
+ "loss": 1.5166,
1686
+ "step": 2730000
1687
+ },
1688
+ {
1689
+ "epoch": 4.84,
1690
+ "learning_rate": 2.5791486134607535e-05,
1691
+ "loss": 1.5162,
1692
+ "step": 2740000
1693
+ },
1694
+ {
1695
+ "epoch": 4.86,
1696
+ "learning_rate": 2.5703142539451892e-05,
1697
+ "loss": 1.5122,
1698
+ "step": 2750000
1699
+ },
1700
+ {
1701
+ "epoch": 4.88,
1702
+ "learning_rate": 2.5614781266738443e-05,
1703
+ "loss": 1.5134,
1704
+ "step": 2760000
1705
+ },
1706
+ {
1707
+ "epoch": 4.9,
1708
+ "learning_rate": 2.55264376715828e-05,
1709
+ "loss": 1.5135,
1710
+ "step": 2770000
1711
+ },
1712
+ {
1713
+ "epoch": 4.91,
1714
+ "learning_rate": 2.543808523764825e-05,
1715
+ "loss": 1.511,
1716
+ "step": 2780000
1717
+ },
1718
+ {
1719
+ "epoch": 4.93,
1720
+ "learning_rate": 2.5349741642492608e-05,
1721
+ "loss": 1.5089,
1722
+ "step": 2790000
1723
+ },
1724
+ {
1725
+ "epoch": 4.95,
1726
+ "learning_rate": 2.5261389208558063e-05,
1727
+ "loss": 1.5149,
1728
+ "step": 2800000
1729
+ },
1730
+ {
1731
+ "epoch": 4.97,
1732
+ "learning_rate": 2.517304561340242e-05,
1733
+ "loss": 1.5121,
1734
+ "step": 2810000
1735
+ },
1736
+ {
1737
+ "epoch": 4.99,
1738
+ "learning_rate": 2.5084693179467872e-05,
1739
+ "loss": 1.5115,
1740
+ "step": 2820000
1741
+ },
1742
+ {
1743
+ "epoch": 5.0,
1744
+ "eval_bleu": 30.9726,
1745
+ "eval_gen_len": 66.2171,
1746
+ "eval_loss": 1.6359007358551025,
1747
+ "eval_runtime": 3064.0636,
1748
+ "eval_samples_per_second": 5.602,
1749
+ "eval_steps_per_second": 0.35,
1750
+ "step": 2828445
1751
+ },
1752
+ {
1753
+ "epoch": 5.0,
1754
+ "learning_rate": 2.4996340745533324e-05,
1755
+ "loss": 1.5045,
1756
+ "step": 2830000
1757
+ },
1758
+ {
1759
+ "epoch": 5.02,
1760
+ "learning_rate": 2.4907988311598776e-05,
1761
+ "loss": 1.4705,
1762
+ "step": 2840000
1763
+ },
1764
+ {
1765
+ "epoch": 5.04,
1766
+ "learning_rate": 2.4819635877664228e-05,
1767
+ "loss": 1.4722,
1768
+ "step": 2850000
1769
+ },
1770
+ {
1771
+ "epoch": 5.06,
1772
+ "learning_rate": 2.4731292282508588e-05,
1773
+ "loss": 1.4756,
1774
+ "step": 2860000
1775
+ },
1776
+ {
1777
+ "epoch": 5.07,
1778
+ "learning_rate": 2.464293984857404e-05,
1779
+ "loss": 1.473,
1780
+ "step": 2870000
1781
+ },
1782
+ {
1783
+ "epoch": 5.09,
1784
+ "learning_rate": 2.45545962534184e-05,
1785
+ "loss": 1.4754,
1786
+ "step": 2880000
1787
+ },
1788
+ {
1789
+ "epoch": 5.11,
1790
+ "learning_rate": 2.4466252658262756e-05,
1791
+ "loss": 1.4775,
1792
+ "step": 2890000
1793
+ },
1794
+ {
1795
+ "epoch": 5.13,
1796
+ "learning_rate": 2.4377909063107116e-05,
1797
+ "loss": 1.4779,
1798
+ "step": 2900000
1799
+ },
1800
+ {
1801
+ "epoch": 5.14,
1802
+ "learning_rate": 2.4289556629172568e-05,
1803
+ "loss": 1.4802,
1804
+ "step": 2910000
1805
+ },
1806
+ {
1807
+ "epoch": 5.16,
1808
+ "learning_rate": 2.420120419523802e-05,
1809
+ "loss": 1.479,
1810
+ "step": 2920000
1811
+ },
1812
+ {
1813
+ "epoch": 5.18,
1814
+ "learning_rate": 2.4112851761303472e-05,
1815
+ "loss": 1.4802,
1816
+ "step": 2930000
1817
+ },
1818
+ {
1819
+ "epoch": 5.2,
1820
+ "learning_rate": 2.4024499327368928e-05,
1821
+ "loss": 1.4796,
1822
+ "step": 2940000
1823
+ },
1824
+ {
1825
+ "epoch": 5.21,
1826
+ "learning_rate": 2.3936155732213284e-05,
1827
+ "loss": 1.4798,
1828
+ "step": 2950000
1829
+ },
1830
+ {
1831
+ "epoch": 5.23,
1832
+ "learning_rate": 2.3847812137057644e-05,
1833
+ "loss": 1.4808,
1834
+ "step": 2960000
1835
+ },
1836
+ {
1837
+ "epoch": 5.25,
1838
+ "learning_rate": 2.3759459703123096e-05,
1839
+ "loss": 1.4811,
1840
+ "step": 2970000
1841
+ },
1842
+ {
1843
+ "epoch": 5.27,
1844
+ "learning_rate": 2.3671107269188548e-05,
1845
+ "loss": 1.4795,
1846
+ "step": 2980000
1847
+ },
1848
+ {
1849
+ "epoch": 5.29,
1850
+ "learning_rate": 2.3582763674032908e-05,
1851
+ "loss": 1.4812,
1852
+ "step": 2990000
1853
+ },
1854
+ {
1855
+ "epoch": 5.3,
1856
+ "learning_rate": 2.349441124009836e-05,
1857
+ "loss": 1.4828,
1858
+ "step": 3000000
1859
+ },
1860
+ {
1861
+ "epoch": 5.32,
1862
+ "learning_rate": 2.3406067644942717e-05,
1863
+ "loss": 1.4809,
1864
+ "step": 3010000
1865
+ },
1866
+ {
1867
+ "epoch": 5.34,
1868
+ "learning_rate": 2.3317715211008172e-05,
1869
+ "loss": 1.4843,
1870
+ "step": 3020000
1871
+ },
1872
+ {
1873
+ "epoch": 5.36,
1874
+ "learning_rate": 2.3229362777073624e-05,
1875
+ "loss": 1.4821,
1876
+ "step": 3030000
1877
+ },
1878
+ {
1879
+ "epoch": 5.37,
1880
+ "learning_rate": 2.3141010343139076e-05,
1881
+ "loss": 1.4808,
1882
+ "step": 3040000
1883
+ },
1884
+ {
1885
+ "epoch": 5.39,
1886
+ "learning_rate": 2.3052666747983432e-05,
1887
+ "loss": 1.4815,
1888
+ "step": 3050000
1889
+ },
1890
+ {
1891
+ "epoch": 5.41,
1892
+ "learning_rate": 2.2964323152827792e-05,
1893
+ "loss": 1.4796,
1894
+ "step": 3060000
1895
+ },
1896
+ {
1897
+ "epoch": 5.43,
1898
+ "learning_rate": 2.2875970718893244e-05,
1899
+ "loss": 1.4823,
1900
+ "step": 3070000
1901
+ },
1902
+ {
1903
+ "epoch": 5.44,
1904
+ "learning_rate": 2.2787618284958696e-05,
1905
+ "loss": 1.4802,
1906
+ "step": 3080000
1907
+ },
1908
+ {
1909
+ "epoch": 5.46,
1910
+ "learning_rate": 2.2699265851024152e-05,
1911
+ "loss": 1.4814,
1912
+ "step": 3090000
1913
+ },
1914
+ {
1915
+ "epoch": 5.48,
1916
+ "learning_rate": 2.26109134170896e-05,
1917
+ "loss": 1.482,
1918
+ "step": 3100000
1919
+ },
1920
+ {
1921
+ "epoch": 5.5,
1922
+ "learning_rate": 2.2522560983155056e-05,
1923
+ "loss": 1.4808,
1924
+ "step": 3110000
1925
+ },
1926
+ {
1927
+ "epoch": 5.52,
1928
+ "learning_rate": 2.2434208549220508e-05,
1929
+ "loss": 1.4823,
1930
+ "step": 3120000
1931
+ },
1932
+ {
1933
+ "epoch": 5.53,
1934
+ "learning_rate": 2.2345856115285963e-05,
1935
+ "loss": 1.4807,
1936
+ "step": 3130000
1937
+ },
1938
+ {
1939
+ "epoch": 5.55,
1940
+ "learning_rate": 2.225751252013032e-05,
1941
+ "loss": 1.4821,
1942
+ "step": 3140000
1943
+ },
1944
+ {
1945
+ "epoch": 5.57,
1946
+ "learning_rate": 2.216916892497468e-05,
1947
+ "loss": 1.4789,
1948
+ "step": 3150000
1949
+ },
1950
+ {
1951
+ "epoch": 5.59,
1952
+ "learning_rate": 2.2080825329819036e-05,
1953
+ "loss": 1.4809,
1954
+ "step": 3160000
1955
+ },
1956
+ {
1957
+ "epoch": 5.6,
1958
+ "learning_rate": 2.199245521832668e-05,
1959
+ "loss": 1.478,
1960
+ "step": 3170000
1961
+ },
1962
+ {
1963
+ "epoch": 5.62,
1964
+ "learning_rate": 2.1904111623171036e-05,
1965
+ "loss": 1.4817,
1966
+ "step": 3180000
1967
+ },
1968
+ {
1969
+ "epoch": 5.64,
1970
+ "learning_rate": 2.1815768028015396e-05,
1971
+ "loss": 1.4811,
1972
+ "step": 3190000
1973
+ },
1974
+ {
1975
+ "epoch": 5.66,
1976
+ "learning_rate": 2.1727415594080848e-05,
1977
+ "loss": 1.4791,
1978
+ "step": 3200000
1979
+ },
1980
+ {
1981
+ "epoch": 5.67,
1982
+ "learning_rate": 2.16390631601463e-05,
1983
+ "loss": 1.4789,
1984
+ "step": 3210000
1985
+ },
1986
+ {
1987
+ "epoch": 5.69,
1988
+ "learning_rate": 2.155071956499066e-05,
1989
+ "loss": 1.4778,
1990
+ "step": 3220000
1991
+ },
1992
+ {
1993
+ "epoch": 5.71,
1994
+ "learning_rate": 2.1462358292277207e-05,
1995
+ "loss": 1.4785,
1996
+ "step": 3230000
1997
+ },
1998
+ {
1999
+ "epoch": 5.73,
2000
+ "learning_rate": 2.137400585834266e-05,
2001
+ "loss": 1.4769,
2002
+ "step": 3240000
2003
+ },
2004
+ {
2005
+ "epoch": 5.75,
2006
+ "learning_rate": 2.128565342440811e-05,
2007
+ "loss": 1.4783,
2008
+ "step": 3250000
2009
+ },
2010
+ {
2011
+ "epoch": 5.76,
2012
+ "learning_rate": 2.119730982925247e-05,
2013
+ "loss": 1.4793,
2014
+ "step": 3260000
2015
+ },
2016
+ {
2017
+ "epoch": 5.78,
2018
+ "learning_rate": 2.1108966234096828e-05,
2019
+ "loss": 1.48,
2020
+ "step": 3270000
2021
+ },
2022
+ {
2023
+ "epoch": 5.8,
2024
+ "learning_rate": 2.102061380016228e-05,
2025
+ "loss": 1.4794,
2026
+ "step": 3280000
2027
+ },
2028
+ {
2029
+ "epoch": 5.82,
2030
+ "learning_rate": 2.0932261366227735e-05,
2031
+ "loss": 1.4788,
2032
+ "step": 3290000
2033
+ },
2034
+ {
2035
+ "epoch": 5.83,
2036
+ "learning_rate": 2.084391777107209e-05,
2037
+ "loss": 1.4791,
2038
+ "step": 3300000
2039
+ },
2040
+ {
2041
+ "epoch": 5.85,
2042
+ "learning_rate": 2.075557417591645e-05,
2043
+ "loss": 1.4779,
2044
+ "step": 3310000
2045
+ },
2046
+ {
2047
+ "epoch": 5.87,
2048
+ "learning_rate": 2.06672217419819e-05,
2049
+ "loss": 1.4751,
2050
+ "step": 3320000
2051
+ },
2052
+ {
2053
+ "epoch": 5.89,
2054
+ "learning_rate": 2.0578869308047356e-05,
2055
+ "loss": 1.4763,
2056
+ "step": 3330000
2057
+ },
2058
+ {
2059
+ "epoch": 5.9,
2060
+ "learning_rate": 2.0490508035333903e-05,
2061
+ "loss": 1.4751,
2062
+ "step": 3340000
2063
+ },
2064
+ {
2065
+ "epoch": 5.92,
2066
+ "learning_rate": 2.0402164440178263e-05,
2067
+ "loss": 1.4753,
2068
+ "step": 3350000
2069
+ },
2070
+ {
2071
+ "epoch": 5.94,
2072
+ "learning_rate": 2.0313812006243715e-05,
2073
+ "loss": 1.4737,
2074
+ "step": 3360000
2075
+ },
2076
+ {
2077
+ "epoch": 5.96,
2078
+ "learning_rate": 2.022546841108807e-05,
2079
+ "loss": 1.4755,
2080
+ "step": 3370000
2081
+ },
2082
+ {
2083
+ "epoch": 5.98,
2084
+ "learning_rate": 2.013710713837462e-05,
2085
+ "loss": 1.4756,
2086
+ "step": 3380000
2087
+ },
2088
+ {
2089
+ "epoch": 5.99,
2090
+ "learning_rate": 2.004876354321898e-05,
2091
+ "loss": 1.474,
2092
+ "step": 3390000
2093
+ },
2094
+ {
2095
+ "epoch": 6.0,
2096
+ "eval_bleu": 31.3244,
2097
+ "eval_gen_len": 66.1843,
2098
+ "eval_loss": 1.6097419261932373,
2099
+ "eval_runtime": 3209.3116,
2100
+ "eval_samples_per_second": 5.348,
2101
+ "eval_steps_per_second": 0.334,
2102
+ "step": 3394134
2103
+ },
2104
+ {
2105
+ "epoch": 6.01,
2106
+ "learning_rate": 1.9960402270505527e-05,
2107
+ "loss": 1.4509,
2108
+ "step": 3400000
2109
+ },
2110
+ {
2111
+ "epoch": 6.03,
2112
+ "learning_rate": 1.9872058675349883e-05,
2113
+ "loss": 1.4364,
2114
+ "step": 3410000
2115
+ },
2116
+ {
2117
+ "epoch": 6.05,
2118
+ "learning_rate": 1.978370624141534e-05,
2119
+ "loss": 1.4356,
2120
+ "step": 3420000
2121
+ },
2122
+ {
2123
+ "epoch": 6.06,
2124
+ "learning_rate": 1.969535380748079e-05,
2125
+ "loss": 1.4374,
2126
+ "step": 3430000
2127
+ },
2128
+ {
2129
+ "epoch": 6.08,
2130
+ "learning_rate": 1.9606992534767338e-05,
2131
+ "loss": 1.4387,
2132
+ "step": 3440000
2133
+ },
2134
+ {
2135
+ "epoch": 6.1,
2136
+ "learning_rate": 1.951864010083279e-05,
2137
+ "loss": 1.4395,
2138
+ "step": 3450000
2139
+ },
2140
+ {
2141
+ "epoch": 6.12,
2142
+ "learning_rate": 1.9430278828119338e-05,
2143
+ "loss": 1.4417,
2144
+ "step": 3460000
2145
+ },
2146
+ {
2147
+ "epoch": 6.13,
2148
+ "learning_rate": 1.9341935232963694e-05,
2149
+ "loss": 1.4411,
2150
+ "step": 3470000
2151
+ },
2152
+ {
2153
+ "epoch": 6.15,
2154
+ "learning_rate": 1.9253573960250245e-05,
2155
+ "loss": 1.4406,
2156
+ "step": 3480000
2157
+ },
2158
+ {
2159
+ "epoch": 6.17,
2160
+ "learning_rate": 1.9165221526315697e-05,
2161
+ "loss": 1.4435,
2162
+ "step": 3490000
2163
+ },
2164
+ {
2165
+ "epoch": 6.19,
2166
+ "learning_rate": 1.907686909238115e-05,
2167
+ "loss": 1.4451,
2168
+ "step": 3500000
2169
+ },
2170
+ {
2171
+ "epoch": 6.2,
2172
+ "learning_rate": 1.8988516658446605e-05,
2173
+ "loss": 1.4467,
2174
+ "step": 3510000
2175
+ },
2176
+ {
2177
+ "epoch": 6.22,
2178
+ "learning_rate": 1.8900164224512057e-05,
2179
+ "loss": 1.443,
2180
+ "step": 3520000
2181
+ },
2182
+ {
2183
+ "epoch": 6.24,
2184
+ "learning_rate": 1.8811811790577512e-05,
2185
+ "loss": 1.4465,
2186
+ "step": 3530000
2187
+ },
2188
+ {
2189
+ "epoch": 6.26,
2190
+ "learning_rate": 1.872345051786406e-05,
2191
+ "loss": 1.4401,
2192
+ "step": 3540000
2193
+ },
2194
+ {
2195
+ "epoch": 6.28,
2196
+ "learning_rate": 1.8635098083929508e-05,
2197
+ "loss": 1.4423,
2198
+ "step": 3550000
2199
+ },
2200
+ {
2201
+ "epoch": 6.29,
2202
+ "learning_rate": 1.8546745649994964e-05,
2203
+ "loss": 1.4443,
2204
+ "step": 3560000
2205
+ },
2206
+ {
2207
+ "epoch": 6.31,
2208
+ "learning_rate": 1.8458393216060416e-05,
2209
+ "loss": 1.4441,
2210
+ "step": 3570000
2211
+ },
2212
+ {
2213
+ "epoch": 6.33,
2214
+ "learning_rate": 1.8370040782125868e-05,
2215
+ "loss": 1.4448,
2216
+ "step": 3580000
2217
+ },
2218
+ {
2219
+ "epoch": 6.35,
2220
+ "learning_rate": 1.8281688348191323e-05,
2221
+ "loss": 1.4438,
2222
+ "step": 3590000
2223
+ },
2224
+ {
2225
+ "epoch": 6.36,
2226
+ "learning_rate": 1.819333591425677e-05,
2227
+ "loss": 1.4479,
2228
+ "step": 3600000
2229
+ },
2230
+ {
2231
+ "epoch": 6.38,
2232
+ "learning_rate": 1.8104983480322227e-05,
2233
+ "loss": 1.4444,
2234
+ "step": 3610000
2235
+ },
2236
+ {
2237
+ "epoch": 6.4,
2238
+ "learning_rate": 1.8016622207608775e-05,
2239
+ "loss": 1.4464,
2240
+ "step": 3620000
2241
+ },
2242
+ {
2243
+ "epoch": 6.42,
2244
+ "learning_rate": 1.7928278612453135e-05,
2245
+ "loss": 1.4453,
2246
+ "step": 3630000
2247
+ },
2248
+ {
2249
+ "epoch": 6.43,
2250
+ "learning_rate": 1.7839917339739682e-05,
2251
+ "loss": 1.4437,
2252
+ "step": 3640000
2253
+ },
2254
+ {
2255
+ "epoch": 6.45,
2256
+ "learning_rate": 1.775157374458404e-05,
2257
+ "loss": 1.4461,
2258
+ "step": 3650000
2259
+ },
2260
+ {
2261
+ "epoch": 6.47,
2262
+ "learning_rate": 1.766322131064949e-05,
2263
+ "loss": 1.4461,
2264
+ "step": 3660000
2265
+ },
2266
+ {
2267
+ "epoch": 6.49,
2268
+ "learning_rate": 1.7574868876714946e-05,
2269
+ "loss": 1.4452,
2270
+ "step": 3670000
2271
+ },
2272
+ {
2273
+ "epoch": 6.51,
2274
+ "learning_rate": 1.7486516442780398e-05,
2275
+ "loss": 1.4465,
2276
+ "step": 3680000
2277
+ },
2278
+ {
2279
+ "epoch": 6.52,
2280
+ "learning_rate": 1.7398164008845853e-05,
2281
+ "loss": 1.4484,
2282
+ "step": 3690000
2283
+ },
2284
+ {
2285
+ "epoch": 6.54,
2286
+ "learning_rate": 1.7309793897353493e-05,
2287
+ "loss": 1.4447,
2288
+ "step": 3700000
2289
+ },
2290
+ {
2291
+ "epoch": 6.56,
2292
+ "learning_rate": 1.7221450302197853e-05,
2293
+ "loss": 1.4449,
2294
+ "step": 3710000
2295
+ },
2296
+ {
2297
+ "epoch": 6.58,
2298
+ "learning_rate": 1.71330890294844e-05,
2299
+ "loss": 1.4437,
2300
+ "step": 3720000
2301
+ },
2302
+ {
2303
+ "epoch": 6.59,
2304
+ "learning_rate": 1.7044736595549853e-05,
2305
+ "loss": 1.4435,
2306
+ "step": 3730000
2307
+ },
2308
+ {
2309
+ "epoch": 6.61,
2310
+ "learning_rate": 1.695639300039421e-05,
2311
+ "loss": 1.4453,
2312
+ "step": 3740000
2313
+ },
2314
+ {
2315
+ "epoch": 6.63,
2316
+ "learning_rate": 1.6868031727680757e-05,
2317
+ "loss": 1.4469,
2318
+ "step": 3750000
2319
+ },
2320
+ {
2321
+ "epoch": 6.65,
2322
+ "learning_rate": 1.6779688132525113e-05,
2323
+ "loss": 1.4446,
2324
+ "step": 3760000
2325
+ },
2326
+ {
2327
+ "epoch": 6.66,
2328
+ "learning_rate": 1.6691326859811664e-05,
2329
+ "loss": 1.4432,
2330
+ "step": 3770000
2331
+ },
2332
+ {
2333
+ "epoch": 6.68,
2334
+ "learning_rate": 1.6602974425877116e-05,
2335
+ "loss": 1.4411,
2336
+ "step": 3780000
2337
+ },
2338
+ {
2339
+ "epoch": 6.7,
2340
+ "learning_rate": 1.6514621991942568e-05,
2341
+ "loss": 1.4424,
2342
+ "step": 3790000
2343
+ },
2344
+ {
2345
+ "epoch": 6.72,
2346
+ "learning_rate": 1.6426260719229116e-05,
2347
+ "loss": 1.4423,
2348
+ "step": 3800000
2349
+ },
2350
+ {
2351
+ "epoch": 6.74,
2352
+ "learning_rate": 1.633790828529457e-05,
2353
+ "loss": 1.4437,
2354
+ "step": 3810000
2355
+ },
2356
+ {
2357
+ "epoch": 6.75,
2358
+ "learning_rate": 1.6249555851360023e-05,
2359
+ "loss": 1.4468,
2360
+ "step": 3820000
2361
+ },
2362
+ {
2363
+ "epoch": 6.77,
2364
+ "learning_rate": 1.616119457864657e-05,
2365
+ "loss": 1.4434,
2366
+ "step": 3830000
2367
+ },
2368
+ {
2369
+ "epoch": 6.79,
2370
+ "learning_rate": 1.607285098349093e-05,
2371
+ "loss": 1.4394,
2372
+ "step": 3840000
2373
+ },
2374
+ {
2375
+ "epoch": 6.81,
2376
+ "learning_rate": 1.5984498549556383e-05,
2377
+ "loss": 1.4409,
2378
+ "step": 3850000
2379
+ },
2380
+ {
2381
+ "epoch": 6.82,
2382
+ "learning_rate": 1.5896146115621835e-05,
2383
+ "loss": 1.4404,
2384
+ "step": 3860000
2385
+ },
2386
+ {
2387
+ "epoch": 6.84,
2388
+ "learning_rate": 1.580779368168729e-05,
2389
+ "loss": 1.4402,
2390
+ "step": 3870000
2391
+ },
2392
+ {
2393
+ "epoch": 6.86,
2394
+ "learning_rate": 1.5719441247752742e-05,
2395
+ "loss": 1.4409,
2396
+ "step": 3880000
2397
+ },
2398
+ {
2399
+ "epoch": 6.88,
2400
+ "learning_rate": 1.56310976525971e-05,
2401
+ "loss": 1.4401,
2402
+ "step": 3890000
2403
+ },
2404
+ {
2405
+ "epoch": 6.89,
2406
+ "learning_rate": 1.5542736379883646e-05,
2407
+ "loss": 1.4392,
2408
+ "step": 3900000
2409
+ },
2410
+ {
2411
+ "epoch": 6.91,
2412
+ "learning_rate": 1.5454392784728006e-05,
2413
+ "loss": 1.4408,
2414
+ "step": 3910000
2415
+ },
2416
+ {
2417
+ "epoch": 6.93,
2418
+ "learning_rate": 1.5366031512014554e-05,
2419
+ "loss": 1.4408,
2420
+ "step": 3920000
2421
+ },
2422
+ {
2423
+ "epoch": 6.95,
2424
+ "learning_rate": 1.5277679078080006e-05,
2425
+ "loss": 1.4398,
2426
+ "step": 3930000
2427
+ },
2428
+ {
2429
+ "epoch": 6.96,
2430
+ "learning_rate": 1.5189326644145458e-05,
2431
+ "loss": 1.4415,
2432
+ "step": 3940000
2433
+ },
2434
+ {
2435
+ "epoch": 6.98,
2436
+ "learning_rate": 1.5100974210210911e-05,
2437
+ "loss": 1.4425,
2438
+ "step": 3950000
2439
+ },
2440
+ {
2441
+ "epoch": 7.0,
2442
+ "eval_bleu": 31.557,
2443
+ "eval_gen_len": 66.1481,
2444
+ "eval_loss": 1.5914360284805298,
2445
+ "eval_runtime": 3218.9621,
2446
+ "eval_samples_per_second": 5.332,
2447
+ "eval_steps_per_second": 0.333,
2448
+ "step": 3959823
2449
+ },
2450
+ {
2451
+ "epoch": 7.0,
2452
+ "learning_rate": 1.5012621776276365e-05,
2453
+ "loss": 1.4396,
2454
+ "step": 3960000
2455
+ },
2456
+ {
2457
+ "epoch": 7.02,
2458
+ "learning_rate": 1.4924269342341817e-05,
2459
+ "loss": 1.4025,
2460
+ "step": 3970000
2461
+ },
2462
+ {
2463
+ "epoch": 7.04,
2464
+ "learning_rate": 1.4835916908407271e-05,
2465
+ "loss": 1.4013,
2466
+ "step": 3980000
2467
+ },
2468
+ {
2469
+ "epoch": 7.05,
2470
+ "learning_rate": 1.4747564474472723e-05,
2471
+ "loss": 1.4035,
2472
+ "step": 3990000
2473
+ },
2474
+ {
2475
+ "epoch": 7.07,
2476
+ "learning_rate": 1.4659212040538175e-05,
2477
+ "loss": 1.4054,
2478
+ "step": 4000000
2479
+ },
2480
+ {
2481
+ "epoch": 7.09,
2482
+ "learning_rate": 1.4570850767824722e-05,
2483
+ "loss": 1.4067,
2484
+ "step": 4010000
2485
+ },
2486
+ {
2487
+ "epoch": 7.11,
2488
+ "learning_rate": 1.4482498333890176e-05,
2489
+ "loss": 1.4044,
2490
+ "step": 4020000
2491
+ },
2492
+ {
2493
+ "epoch": 7.12,
2494
+ "learning_rate": 1.439414589995563e-05,
2495
+ "loss": 1.4081,
2496
+ "step": 4030000
2497
+ },
2498
+ {
2499
+ "epoch": 7.14,
2500
+ "learning_rate": 1.4305802304799986e-05,
2501
+ "loss": 1.4049,
2502
+ "step": 4040000
2503
+ },
2504
+ {
2505
+ "epoch": 7.16,
2506
+ "learning_rate": 1.4217441032086537e-05,
2507
+ "loss": 1.4072,
2508
+ "step": 4050000
2509
+ },
2510
+ {
2511
+ "epoch": 7.18,
2512
+ "learning_rate": 1.4129079759373085e-05,
2513
+ "loss": 1.4085,
2514
+ "step": 4060000
2515
+ },
2516
+ {
2517
+ "epoch": 7.19,
2518
+ "learning_rate": 1.4040727325438539e-05,
2519
+ "loss": 1.4078,
2520
+ "step": 4070000
2521
+ },
2522
+ {
2523
+ "epoch": 7.21,
2524
+ "learning_rate": 1.3952383730282895e-05,
2525
+ "loss": 1.4066,
2526
+ "step": 4080000
2527
+ },
2528
+ {
2529
+ "epoch": 7.23,
2530
+ "learning_rate": 1.3864031296348349e-05,
2531
+ "loss": 1.4105,
2532
+ "step": 4090000
2533
+ },
2534
+ {
2535
+ "epoch": 7.25,
2536
+ "learning_rate": 1.3775678862413801e-05,
2537
+ "loss": 1.4116,
2538
+ "step": 4100000
2539
+ },
2540
+ {
2541
+ "epoch": 7.27,
2542
+ "learning_rate": 1.3687326428479253e-05,
2543
+ "loss": 1.4095,
2544
+ "step": 4110000
2545
+ },
2546
+ {
2547
+ "epoch": 7.28,
2548
+ "learning_rate": 1.3598973994544707e-05,
2549
+ "loss": 1.4104,
2550
+ "step": 4120000
2551
+ },
2552
+ {
2553
+ "epoch": 7.3,
2554
+ "learning_rate": 1.3510621560610159e-05,
2555
+ "loss": 1.4105,
2556
+ "step": 4130000
2557
+ },
2558
+ {
2559
+ "epoch": 7.32,
2560
+ "learning_rate": 1.3422269126675612e-05,
2561
+ "loss": 1.4104,
2562
+ "step": 4140000
2563
+ },
2564
+ {
2565
+ "epoch": 7.34,
2566
+ "learning_rate": 1.3333916692741066e-05,
2567
+ "loss": 1.4106,
2568
+ "step": 4150000
2569
+ },
2570
+ {
2571
+ "epoch": 7.35,
2572
+ "learning_rate": 1.3245564258806516e-05,
2573
+ "loss": 1.4114,
2574
+ "step": 4160000
2575
+ },
2576
+ {
2577
+ "epoch": 7.37,
2578
+ "learning_rate": 1.315721182487197e-05,
2579
+ "loss": 1.4108,
2580
+ "step": 4170000
2581
+ },
2582
+ {
2583
+ "epoch": 7.39,
2584
+ "learning_rate": 1.306886822971633e-05,
2585
+ "loss": 1.409,
2586
+ "step": 4180000
2587
+ },
2588
+ {
2589
+ "epoch": 7.41,
2590
+ "learning_rate": 1.298051579578178e-05,
2591
+ "loss": 1.4106,
2592
+ "step": 4190000
2593
+ },
2594
+ {
2595
+ "epoch": 7.42,
2596
+ "learning_rate": 1.2892163361847234e-05,
2597
+ "loss": 1.4118,
2598
+ "step": 4200000
2599
+ },
2600
+ {
2601
+ "epoch": 7.44,
2602
+ "learning_rate": 1.2803819766691594e-05,
2603
+ "loss": 1.4115,
2604
+ "step": 4210000
2605
+ },
2606
+ {
2607
+ "epoch": 7.46,
2608
+ "learning_rate": 1.2715467332757044e-05,
2609
+ "loss": 1.4135,
2610
+ "step": 4220000
2611
+ },
2612
+ {
2613
+ "epoch": 7.48,
2614
+ "learning_rate": 1.2627123737601404e-05,
2615
+ "loss": 1.4104,
2616
+ "step": 4230000
2617
+ },
2618
+ {
2619
+ "epoch": 7.5,
2620
+ "learning_rate": 1.2538771303666858e-05,
2621
+ "loss": 1.4124,
2622
+ "step": 4240000
2623
+ },
2624
+ {
2625
+ "epoch": 7.51,
2626
+ "learning_rate": 1.2450427708511214e-05,
2627
+ "loss": 1.4094,
2628
+ "step": 4250000
2629
+ },
2630
+ {
2631
+ "epoch": 7.53,
2632
+ "learning_rate": 1.2362075274576668e-05,
2633
+ "loss": 1.4058,
2634
+ "step": 4260000
2635
+ },
2636
+ {
2637
+ "epoch": 7.55,
2638
+ "learning_rate": 1.227372284064212e-05,
2639
+ "loss": 1.4108,
2640
+ "step": 4270000
2641
+ },
2642
+ {
2643
+ "epoch": 7.57,
2644
+ "learning_rate": 1.2185379245486478e-05,
2645
+ "loss": 1.411,
2646
+ "step": 4280000
2647
+ },
2648
+ {
2649
+ "epoch": 7.58,
2650
+ "learning_rate": 1.2097035650330837e-05,
2651
+ "loss": 1.4102,
2652
+ "step": 4290000
2653
+ },
2654
+ {
2655
+ "epoch": 7.6,
2656
+ "learning_rate": 1.200868321639629e-05,
2657
+ "loss": 1.4126,
2658
+ "step": 4300000
2659
+ },
2660
+ {
2661
+ "epoch": 7.62,
2662
+ "learning_rate": 1.1920330782461742e-05,
2663
+ "loss": 1.4104,
2664
+ "step": 4310000
2665
+ },
2666
+ {
2667
+ "epoch": 7.64,
2668
+ "learning_rate": 1.18319871873061e-05,
2669
+ "loss": 1.4127,
2670
+ "step": 4320000
2671
+ },
2672
+ {
2673
+ "epoch": 7.65,
2674
+ "learning_rate": 1.1743634753371553e-05,
2675
+ "loss": 1.4094,
2676
+ "step": 4330000
2677
+ },
2678
+ {
2679
+ "epoch": 7.67,
2680
+ "learning_rate": 1.165529115821591e-05,
2681
+ "loss": 1.4097,
2682
+ "step": 4340000
2683
+ },
2684
+ {
2685
+ "epoch": 7.69,
2686
+ "learning_rate": 1.1566938724281363e-05,
2687
+ "loss": 1.4095,
2688
+ "step": 4350000
2689
+ },
2690
+ {
2691
+ "epoch": 7.71,
2692
+ "learning_rate": 1.1478595129125721e-05,
2693
+ "loss": 1.4079,
2694
+ "step": 4360000
2695
+ },
2696
+ {
2697
+ "epoch": 7.73,
2698
+ "learning_rate": 1.1390242695191175e-05,
2699
+ "loss": 1.4127,
2700
+ "step": 4370000
2701
+ },
2702
+ {
2703
+ "epoch": 7.74,
2704
+ "learning_rate": 1.1301890261256627e-05,
2705
+ "loss": 1.4079,
2706
+ "step": 4380000
2707
+ },
2708
+ {
2709
+ "epoch": 7.76,
2710
+ "learning_rate": 1.1213546666100985e-05,
2711
+ "loss": 1.4065,
2712
+ "step": 4390000
2713
+ },
2714
+ {
2715
+ "epoch": 7.78,
2716
+ "learning_rate": 1.1125203070945343e-05,
2717
+ "loss": 1.4098,
2718
+ "step": 4400000
2719
+ },
2720
+ {
2721
+ "epoch": 7.8,
2722
+ "learning_rate": 1.1036850637010797e-05,
2723
+ "loss": 1.4123,
2724
+ "step": 4410000
2725
+ },
2726
+ {
2727
+ "epoch": 7.81,
2728
+ "learning_rate": 1.0948507041855153e-05,
2729
+ "loss": 1.409,
2730
+ "step": 4420000
2731
+ },
2732
+ {
2733
+ "epoch": 7.83,
2734
+ "learning_rate": 1.0860163446699512e-05,
2735
+ "loss": 1.4045,
2736
+ "step": 4430000
2737
+ },
2738
+ {
2739
+ "epoch": 7.85,
2740
+ "learning_rate": 1.0771811012764965e-05,
2741
+ "loss": 1.4102,
2742
+ "step": 4440000
2743
+ },
2744
+ {
2745
+ "epoch": 7.87,
2746
+ "learning_rate": 1.0683467417609323e-05,
2747
+ "loss": 1.4085,
2748
+ "step": 4450000
2749
+ },
2750
+ {
2751
+ "epoch": 7.88,
2752
+ "learning_rate": 1.0595114983674775e-05,
2753
+ "loss": 1.4038,
2754
+ "step": 4460000
2755
+ },
2756
+ {
2757
+ "epoch": 7.9,
2758
+ "learning_rate": 1.0506771388519134e-05,
2759
+ "loss": 1.4052,
2760
+ "step": 4470000
2761
+ },
2762
+ {
2763
+ "epoch": 7.92,
2764
+ "learning_rate": 1.0418427793363492e-05,
2765
+ "loss": 1.4094,
2766
+ "step": 4480000
2767
+ },
2768
+ {
2769
+ "epoch": 7.94,
2770
+ "learning_rate": 1.033008419820785e-05,
2771
+ "loss": 1.4071,
2772
+ "step": 4490000
2773
+ },
2774
+ {
2775
+ "epoch": 7.95,
2776
+ "learning_rate": 1.0241731764273304e-05,
2777
+ "loss": 1.4075,
2778
+ "step": 4500000
2779
+ },
2780
+ {
2781
+ "epoch": 7.97,
2782
+ "learning_rate": 1.0153379330338756e-05,
2783
+ "loss": 1.4047,
2784
+ "step": 4510000
2785
+ },
2786
+ {
2787
+ "epoch": 7.99,
2788
+ "learning_rate": 1.0065035735183114e-05,
2789
+ "loss": 1.4063,
2790
+ "step": 4520000
2791
+ },
2792
+ {
2793
+ "epoch": 8.0,
2794
+ "eval_bleu": 32.0886,
2795
+ "eval_gen_len": 65.8595,
2796
+ "eval_loss": 1.5665596723556519,
2797
+ "eval_runtime": 3002.5617,
2798
+ "eval_samples_per_second": 5.717,
2799
+ "eval_steps_per_second": 0.357,
2800
+ "step": 4525512
2801
+ },
2802
+ {
2803
+ "epoch": 8.01,
2804
+ "learning_rate": 9.976692140027472e-06,
2805
+ "loss": 1.3896,
2806
+ "step": 4530000
2807
+ },
2808
+ {
2809
+ "epoch": 8.03,
2810
+ "learning_rate": 9.88834854487183e-06,
2811
+ "loss": 1.3736,
2812
+ "step": 4540000
2813
+ },
2814
+ {
2815
+ "epoch": 8.04,
2816
+ "learning_rate": 9.799996110937282e-06,
2817
+ "loss": 1.3741,
2818
+ "step": 4550000
2819
+ },
2820
+ {
2821
+ "epoch": 8.06,
2822
+ "learning_rate": 9.71165251578164e-06,
2823
+ "loss": 1.3717,
2824
+ "step": 4560000
2825
+ },
2826
+ {
2827
+ "epoch": 8.08,
2828
+ "learning_rate": 9.623308920625997e-06,
2829
+ "loss": 1.3731,
2830
+ "step": 4570000
2831
+ },
2832
+ {
2833
+ "epoch": 8.1,
2834
+ "learning_rate": 9.534965325470355e-06,
2835
+ "loss": 1.375,
2836
+ "step": 4580000
2837
+ },
2838
+ {
2839
+ "epoch": 8.11,
2840
+ "learning_rate": 9.446612891535809e-06,
2841
+ "loss": 1.3764,
2842
+ "step": 4590000
2843
+ },
2844
+ {
2845
+ "epoch": 8.13,
2846
+ "learning_rate": 9.358269296380166e-06,
2847
+ "loss": 1.3752,
2848
+ "step": 4600000
2849
+ },
2850
+ {
2851
+ "epoch": 8.15,
2852
+ "learning_rate": 9.26991686244562e-06,
2853
+ "loss": 1.3768,
2854
+ "step": 4610000
2855
+ },
2856
+ {
2857
+ "epoch": 8.17,
2858
+ "learning_rate": 9.181573267289978e-06,
2859
+ "loss": 1.3767,
2860
+ "step": 4620000
2861
+ },
2862
+ {
2863
+ "epoch": 8.18,
2864
+ "learning_rate": 9.093229672134336e-06,
2865
+ "loss": 1.3769,
2866
+ "step": 4630000
2867
+ },
2868
+ {
2869
+ "epoch": 8.2,
2870
+ "learning_rate": 9.004886076978694e-06,
2871
+ "loss": 1.3772,
2872
+ "step": 4640000
2873
+ },
2874
+ {
2875
+ "epoch": 8.22,
2876
+ "learning_rate": 8.916533643044146e-06,
2877
+ "loss": 1.3766,
2878
+ "step": 4650000
2879
+ },
2880
+ {
2881
+ "epoch": 8.24,
2882
+ "learning_rate": 8.828190047888504e-06,
2883
+ "loss": 1.38,
2884
+ "step": 4660000
2885
+ },
2886
+ {
2887
+ "epoch": 8.26,
2888
+ "learning_rate": 8.739846452732862e-06,
2889
+ "loss": 1.3764,
2890
+ "step": 4670000
2891
+ },
2892
+ {
2893
+ "epoch": 8.27,
2894
+ "learning_rate": 8.65150285757722e-06,
2895
+ "loss": 1.3765,
2896
+ "step": 4680000
2897
+ },
2898
+ {
2899
+ "epoch": 8.29,
2900
+ "learning_rate": 8.563150423642673e-06,
2901
+ "loss": 1.3765,
2902
+ "step": 4690000
2903
+ },
2904
+ {
2905
+ "epoch": 8.31,
2906
+ "learning_rate": 8.47480682848703e-06,
2907
+ "loss": 1.3756,
2908
+ "step": 4700000
2909
+ },
2910
+ {
2911
+ "epoch": 8.33,
2912
+ "learning_rate": 8.386463233331389e-06,
2913
+ "loss": 1.3781,
2914
+ "step": 4710000
2915
+ },
2916
+ {
2917
+ "epoch": 8.34,
2918
+ "learning_rate": 8.298110799396843e-06,
2919
+ "loss": 1.3788,
2920
+ "step": 4720000
2921
+ },
2922
+ {
2923
+ "epoch": 8.36,
2924
+ "learning_rate": 8.209767204241201e-06,
2925
+ "loss": 1.3759,
2926
+ "step": 4730000
2927
+ },
2928
+ {
2929
+ "epoch": 8.38,
2930
+ "learning_rate": 8.121423609085559e-06,
2931
+ "loss": 1.3783,
2932
+ "step": 4740000
2933
+ },
2934
+ {
2935
+ "epoch": 8.4,
2936
+ "learning_rate": 8.033080013929916e-06,
2937
+ "loss": 1.3782,
2938
+ "step": 4750000
2939
+ },
2940
+ {
2941
+ "epoch": 8.41,
2942
+ "learning_rate": 7.944736418774274e-06,
2943
+ "loss": 1.3792,
2944
+ "step": 4760000
2945
+ },
2946
+ {
2947
+ "epoch": 8.43,
2948
+ "learning_rate": 7.856383984839727e-06,
2949
+ "loss": 1.3775,
2950
+ "step": 4770000
2951
+ },
2952
+ {
2953
+ "epoch": 8.45,
2954
+ "learning_rate": 7.768040389684086e-06,
2955
+ "loss": 1.3779,
2956
+ "step": 4780000
2957
+ },
2958
+ {
2959
+ "epoch": 8.47,
2960
+ "learning_rate": 7.679696794528444e-06,
2961
+ "loss": 1.3797,
2962
+ "step": 4790000
2963
+ },
2964
+ {
2965
+ "epoch": 8.49,
2966
+ "learning_rate": 7.591353199372801e-06,
2967
+ "loss": 1.3761,
2968
+ "step": 4800000
2969
+ },
2970
+ {
2971
+ "epoch": 8.5,
2972
+ "learning_rate": 7.503009604217158e-06,
2973
+ "loss": 1.3784,
2974
+ "step": 4810000
2975
+ },
2976
+ {
2977
+ "epoch": 8.52,
2978
+ "learning_rate": 7.414666009061516e-06,
2979
+ "loss": 1.3769,
2980
+ "step": 4820000
2981
+ },
2982
+ {
2983
+ "epoch": 8.54,
2984
+ "learning_rate": 7.32631357512697e-06,
2985
+ "loss": 1.3764,
2986
+ "step": 4830000
2987
+ },
2988
+ {
2989
+ "epoch": 8.56,
2990
+ "learning_rate": 7.237969979971328e-06,
2991
+ "loss": 1.3818,
2992
+ "step": 4840000
2993
+ },
2994
+ {
2995
+ "epoch": 8.57,
2996
+ "learning_rate": 7.149626384815686e-06,
2997
+ "loss": 1.3787,
2998
+ "step": 4850000
2999
+ },
3000
+ {
3001
+ "epoch": 8.59,
3002
+ "learning_rate": 7.061282789660044e-06,
3003
+ "loss": 1.3762,
3004
+ "step": 4860000
3005
+ },
3006
+ {
3007
+ "epoch": 8.61,
3008
+ "learning_rate": 6.972939194504401e-06,
3009
+ "loss": 1.3788,
3010
+ "step": 4870000
3011
+ },
3012
+ {
3013
+ "epoch": 8.63,
3014
+ "learning_rate": 6.884595599348759e-06,
3015
+ "loss": 1.3752,
3016
+ "step": 4880000
3017
+ },
3018
+ {
3019
+ "epoch": 8.64,
3020
+ "learning_rate": 6.796243165414212e-06,
3021
+ "loss": 1.3771,
3022
+ "step": 4890000
3023
+ },
3024
+ {
3025
+ "epoch": 8.66,
3026
+ "learning_rate": 6.70789957025857e-06,
3027
+ "loss": 1.3785,
3028
+ "step": 4900000
3029
+ },
3030
+ {
3031
+ "epoch": 8.68,
3032
+ "learning_rate": 6.619555975102928e-06,
3033
+ "loss": 1.3746,
3034
+ "step": 4910000
3035
+ },
3036
+ {
3037
+ "epoch": 8.7,
3038
+ "learning_rate": 6.531212379947286e-06,
3039
+ "loss": 1.3769,
3040
+ "step": 4920000
3041
+ },
3042
+ {
3043
+ "epoch": 8.72,
3044
+ "learning_rate": 6.4428599460127385e-06,
3045
+ "loss": 1.3781,
3046
+ "step": 4930000
3047
+ },
3048
+ {
3049
+ "epoch": 8.73,
3050
+ "learning_rate": 6.354516350857097e-06,
3051
+ "loss": 1.3756,
3052
+ "step": 4940000
3053
+ },
3054
+ {
3055
+ "epoch": 8.75,
3056
+ "learning_rate": 6.266172755701455e-06,
3057
+ "loss": 1.3761,
3058
+ "step": 4950000
3059
+ },
3060
+ {
3061
+ "epoch": 8.77,
3062
+ "learning_rate": 6.177829160545812e-06,
3063
+ "loss": 1.3762,
3064
+ "step": 4960000
3065
+ },
3066
+ {
3067
+ "epoch": 8.79,
3068
+ "learning_rate": 6.089476726611266e-06,
3069
+ "loss": 1.3754,
3070
+ "step": 4970000
3071
+ },
3072
+ {
3073
+ "epoch": 8.8,
3074
+ "learning_rate": 6.001133131455623e-06,
3075
+ "loss": 1.3728,
3076
+ "step": 4980000
3077
+ },
3078
+ {
3079
+ "epoch": 8.82,
3080
+ "learning_rate": 5.9127895362999815e-06,
3081
+ "loss": 1.3759,
3082
+ "step": 4990000
3083
+ },
3084
+ {
3085
+ "epoch": 8.84,
3086
+ "learning_rate": 5.82444594114434e-06,
3087
+ "loss": 1.374,
3088
+ "step": 5000000
3089
+ },
3090
+ {
3091
+ "epoch": 8.86,
3092
+ "learning_rate": 5.736102345988698e-06,
3093
+ "loss": 1.3737,
3094
+ "step": 5010000
3095
+ },
3096
+ {
3097
+ "epoch": 8.87,
3098
+ "learning_rate": 5.647758750833055e-06,
3099
+ "loss": 1.3749,
3100
+ "step": 5020000
3101
+ },
3102
+ {
3103
+ "epoch": 8.89,
3104
+ "learning_rate": 5.559406316898508e-06,
3105
+ "loss": 1.3757,
3106
+ "step": 5030000
3107
+ },
3108
+ {
3109
+ "epoch": 8.91,
3110
+ "learning_rate": 5.4710627217428655e-06,
3111
+ "loss": 1.376,
3112
+ "step": 5040000
3113
+ },
3114
+ {
3115
+ "epoch": 8.93,
3116
+ "learning_rate": 5.382719126587224e-06,
3117
+ "loss": 1.3753,
3118
+ "step": 5050000
3119
+ },
3120
+ {
3121
+ "epoch": 8.94,
3122
+ "learning_rate": 5.294375531431582e-06,
3123
+ "loss": 1.3763,
3124
+ "step": 5060000
3125
+ },
3126
+ {
3127
+ "epoch": 8.96,
3128
+ "learning_rate": 5.206031936275939e-06,
3129
+ "loss": 1.3731,
3130
+ "step": 5070000
3131
+ },
3132
+ {
3133
+ "epoch": 8.98,
3134
+ "learning_rate": 5.117679502341393e-06,
3135
+ "loss": 1.3711,
3136
+ "step": 5080000
3137
+ },
3138
+ {
3139
+ "epoch": 9.0,
3140
+ "learning_rate": 5.02933590718575e-06,
3141
+ "loss": 1.3724,
3142
+ "step": 5090000
3143
+ },
3144
+ {
3145
+ "epoch": 9.0,
3146
+ "eval_bleu": 32.3644,
3147
+ "eval_gen_len": 66.1648,
3148
+ "eval_loss": 1.5537199974060059,
3149
+ "eval_runtime": 3034.8877,
3150
+ "eval_samples_per_second": 5.656,
3151
+ "eval_steps_per_second": 0.354,
3152
+ "step": 5091201
3153
+ },
3154
+ {
3155
+ "epoch": 9.02,
3156
+ "learning_rate": 4.9409923120301085e-06,
3157
+ "loss": 1.3485,
3158
+ "step": 5100000
3159
+ },
3160
+ {
3161
+ "epoch": 9.03,
3162
+ "learning_rate": 4.852648716874467e-06,
3163
+ "loss": 1.3453,
3164
+ "step": 5110000
3165
+ },
3166
+ {
3167
+ "epoch": 9.05,
3168
+ "learning_rate": 4.764305121718825e-06,
3169
+ "loss": 1.3439,
3170
+ "step": 5120000
3171
+ },
3172
+ {
3173
+ "epoch": 9.07,
3174
+ "learning_rate": 4.675961526563182e-06,
3175
+ "loss": 1.3475,
3176
+ "step": 5130000
3177
+ },
3178
+ {
3179
+ "epoch": 9.09,
3180
+ "learning_rate": 4.5876179314075404e-06,
3181
+ "loss": 1.3443,
3182
+ "step": 5140000
3183
+ },
3184
+ {
3185
+ "epoch": 9.1,
3186
+ "learning_rate": 4.499265497472993e-06,
3187
+ "loss": 1.3456,
3188
+ "step": 5150000
3189
+ },
3190
+ {
3191
+ "epoch": 9.12,
3192
+ "learning_rate": 4.4109219023173515e-06,
3193
+ "loss": 1.3474,
3194
+ "step": 5160000
3195
+ },
3196
+ {
3197
+ "epoch": 9.14,
3198
+ "learning_rate": 4.32257830716171e-06,
3199
+ "loss": 1.3491,
3200
+ "step": 5170000
3201
+ },
3202
+ {
3203
+ "epoch": 9.16,
3204
+ "learning_rate": 4.234234712006067e-06,
3205
+ "loss": 1.3491,
3206
+ "step": 5180000
3207
+ },
3208
+ {
3209
+ "epoch": 9.17,
3210
+ "learning_rate": 4.14588227807152e-06,
3211
+ "loss": 1.3493,
3212
+ "step": 5190000
3213
+ },
3214
+ {
3215
+ "epoch": 9.19,
3216
+ "learning_rate": 4.057538682915878e-06,
3217
+ "loss": 1.3485,
3218
+ "step": 5200000
3219
+ },
3220
+ {
3221
+ "epoch": 9.21,
3222
+ "learning_rate": 3.9691950877602355e-06,
3223
+ "loss": 1.3506,
3224
+ "step": 5210000
3225
+ },
3226
+ {
3227
+ "epoch": 9.23,
3228
+ "learning_rate": 3.880842653825688e-06,
3229
+ "loss": 1.3498,
3230
+ "step": 5220000
3231
+ },
3232
+ {
3233
+ "epoch": 9.25,
3234
+ "learning_rate": 3.7924990586700466e-06,
3235
+ "loss": 1.3453,
3236
+ "step": 5230000
3237
+ },
3238
+ {
3239
+ "epoch": 9.26,
3240
+ "learning_rate": 3.7041554635144048e-06,
3241
+ "loss": 1.3475,
3242
+ "step": 5240000
3243
+ },
3244
+ {
3245
+ "epoch": 9.28,
3246
+ "learning_rate": 3.615811868358763e-06,
3247
+ "loss": 1.3469,
3248
+ "step": 5250000
3249
+ },
3250
+ {
3251
+ "epoch": 9.3,
3252
+ "learning_rate": 3.5274682732031203e-06,
3253
+ "loss": 1.3477,
3254
+ "step": 5260000
3255
+ },
3256
+ {
3257
+ "epoch": 9.32,
3258
+ "learning_rate": 3.4391246780474785e-06,
3259
+ "loss": 1.3443,
3260
+ "step": 5270000
3261
+ },
3262
+ {
3263
+ "epoch": 9.33,
3264
+ "learning_rate": 3.3507810828918367e-06,
3265
+ "loss": 1.3478,
3266
+ "step": 5280000
3267
+ },
3268
+ {
3269
+ "epoch": 9.35,
3270
+ "learning_rate": 3.2624374877361945e-06,
3271
+ "loss": 1.3473,
3272
+ "step": 5290000
3273
+ },
3274
+ {
3275
+ "epoch": 9.37,
3276
+ "learning_rate": 3.1740850538016474e-06,
3277
+ "loss": 1.349,
3278
+ "step": 5300000
3279
+ },
3280
+ {
3281
+ "epoch": 9.39,
3282
+ "learning_rate": 3.085741458646005e-06,
3283
+ "loss": 1.3485,
3284
+ "step": 5310000
3285
+ },
3286
+ {
3287
+ "epoch": 9.4,
3288
+ "learning_rate": 2.9973978634903633e-06,
3289
+ "loss": 1.349,
3290
+ "step": 5320000
3291
+ },
3292
+ {
3293
+ "epoch": 9.42,
3294
+ "learning_rate": 2.909054268334721e-06,
3295
+ "loss": 1.3478,
3296
+ "step": 5330000
3297
+ },
3298
+ {
3299
+ "epoch": 9.44,
3300
+ "learning_rate": 2.8207106731790793e-06,
3301
+ "loss": 1.3489,
3302
+ "step": 5340000
3303
+ },
3304
+ {
3305
+ "epoch": 9.46,
3306
+ "learning_rate": 2.732367078023437e-06,
3307
+ "loss": 1.3446,
3308
+ "step": 5350000
3309
+ },
3310
+ {
3311
+ "epoch": 9.48,
3312
+ "learning_rate": 2.644023482867795e-06,
3313
+ "loss": 1.3478,
3314
+ "step": 5360000
3315
+ },
3316
+ {
3317
+ "epoch": 9.49,
3318
+ "learning_rate": 2.5556798877121526e-06,
3319
+ "loss": 1.3491,
3320
+ "step": 5370000
3321
+ },
3322
+ {
3323
+ "epoch": 9.51,
3324
+ "learning_rate": 2.467336292556511e-06,
3325
+ "loss": 1.3467,
3326
+ "step": 5380000
3327
+ },
3328
+ {
3329
+ "epoch": 9.53,
3330
+ "learning_rate": 2.3789926974008686e-06,
3331
+ "loss": 1.3492,
3332
+ "step": 5390000
3333
+ },
3334
+ {
3335
+ "epoch": 9.55,
3336
+ "learning_rate": 2.290640263466322e-06,
3337
+ "loss": 1.3472,
3338
+ "step": 5400000
3339
+ },
3340
+ {
3341
+ "epoch": 9.56,
3342
+ "learning_rate": 2.2022966683106797e-06,
3343
+ "loss": 1.3439,
3344
+ "step": 5410000
3345
+ },
3346
+ {
3347
+ "epoch": 9.58,
3348
+ "learning_rate": 2.113953073155038e-06,
3349
+ "loss": 1.3474,
3350
+ "step": 5420000
3351
+ },
3352
+ {
3353
+ "epoch": 9.6,
3354
+ "learning_rate": 2.0256094779993956e-06,
3355
+ "loss": 1.3459,
3356
+ "step": 5430000
3357
+ },
3358
+ {
3359
+ "epoch": 9.62,
3360
+ "learning_rate": 1.9372658828437534e-06,
3361
+ "loss": 1.3485,
3362
+ "step": 5440000
3363
+ },
3364
+ {
3365
+ "epoch": 9.63,
3366
+ "learning_rate": 1.8489134489092063e-06,
3367
+ "loss": 1.3438,
3368
+ "step": 5450000
3369
+ },
3370
+ {
3371
+ "epoch": 9.65,
3372
+ "learning_rate": 1.7605698537535645e-06,
3373
+ "loss": 1.3449,
3374
+ "step": 5460000
3375
+ },
3376
+ {
3377
+ "epoch": 9.67,
3378
+ "learning_rate": 1.6722262585979223e-06,
3379
+ "loss": 1.3472,
3380
+ "step": 5470000
3381
+ },
3382
+ {
3383
+ "epoch": 9.69,
3384
+ "learning_rate": 1.5838826634422803e-06,
3385
+ "loss": 1.3476,
3386
+ "step": 5480000
3387
+ },
3388
+ {
3389
+ "epoch": 9.7,
3390
+ "learning_rate": 1.495539068286638e-06,
3391
+ "loss": 1.3462,
3392
+ "step": 5490000
3393
+ },
3394
+ {
3395
+ "epoch": 9.72,
3396
+ "learning_rate": 1.407195473130996e-06,
3397
+ "loss": 1.3479,
3398
+ "step": 5500000
3399
+ },
3400
+ {
3401
+ "epoch": 9.74,
3402
+ "learning_rate": 1.318851877975354e-06,
3403
+ "loss": 1.3441,
3404
+ "step": 5510000
3405
+ },
3406
+ {
3407
+ "epoch": 9.76,
3408
+ "learning_rate": 1.2304994440408069e-06,
3409
+ "loss": 1.3439,
3410
+ "step": 5520000
3411
+ },
3412
+ {
3413
+ "epoch": 9.78,
3414
+ "learning_rate": 1.1421558488851649e-06,
3415
+ "loss": 1.3459,
3416
+ "step": 5530000
3417
+ },
3418
+ {
3419
+ "epoch": 9.79,
3420
+ "learning_rate": 1.0538122537295229e-06,
3421
+ "loss": 1.3443,
3422
+ "step": 5540000
3423
+ },
3424
+ {
3425
+ "epoch": 9.81,
3426
+ "learning_rate": 9.654686585738808e-07,
3427
+ "loss": 1.3455,
3428
+ "step": 5550000
3429
+ },
3430
+ {
3431
+ "epoch": 9.83,
3432
+ "learning_rate": 8.771162246393337e-07,
3433
+ "loss": 1.3448,
3434
+ "step": 5560000
3435
+ },
3436
+ {
3437
+ "epoch": 9.85,
3438
+ "learning_rate": 7.887726294836917e-07,
3439
+ "loss": 1.3432,
3440
+ "step": 5570000
3441
+ },
3442
+ {
3443
+ "epoch": 9.86,
3444
+ "learning_rate": 7.004290343280496e-07,
3445
+ "loss": 1.3444,
3446
+ "step": 5580000
3447
+ },
3448
+ {
3449
+ "epoch": 9.88,
3450
+ "learning_rate": 6.120854391724075e-07,
3451
+ "loss": 1.3432,
3452
+ "step": 5590000
3453
+ },
3454
+ {
3455
+ "epoch": 9.9,
3456
+ "learning_rate": 5.237418440167655e-07,
3457
+ "loss": 1.3454,
3458
+ "step": 5600000
3459
+ },
3460
+ {
3461
+ "epoch": 9.92,
3462
+ "learning_rate": 4.353894100822184e-07,
3463
+ "loss": 1.3437,
3464
+ "step": 5610000
3465
+ },
3466
+ {
3467
+ "epoch": 9.93,
3468
+ "learning_rate": 3.4704581492657626e-07,
3469
+ "loss": 1.3447,
3470
+ "step": 5620000
3471
+ },
3472
+ {
3473
+ "epoch": 9.95,
3474
+ "learning_rate": 2.587022197709342e-07,
3475
+ "loss": 1.3441,
3476
+ "step": 5630000
3477
+ },
3478
+ {
3479
+ "epoch": 9.97,
3480
+ "learning_rate": 1.7035862461529215e-07,
3481
+ "loss": 1.3426,
3482
+ "step": 5640000
3483
+ },
3484
+ {
3485
+ "epoch": 9.99,
3486
+ "learning_rate": 8.201502945965009e-08,
3487
+ "loss": 1.3452,
3488
+ "step": 5650000
3489
+ },
3490
+ {
3491
+ "epoch": 10.0,
3492
+ "eval_bleu": 32.4724,
3493
+ "eval_gen_len": 66.1539,
3494
+ "eval_loss": 1.5473366975784302,
3495
+ "eval_runtime": 3064.956,
3496
+ "eval_samples_per_second": 5.6,
3497
+ "eval_steps_per_second": 0.35,
3498
+ "step": 5656890
3499
+ },
3500
+ {
3501
+ "epoch": 10.0,
3502
+ "step": 5656890,
3503
+ "total_flos": 1.8656478019360383e+19,
3504
+ "train_loss": 1.5652431253911159,
3505
+ "train_runtime": 1147827.3858,
3506
+ "train_samples_per_second": 78.854,
3507
+ "train_steps_per_second": 4.928
3508
+ }
3509
+ ],
3510
+ "max_steps": 5656890,
3511
+ "num_train_epochs": 10,
3512
+ "total_flos": 1.8656478019360383e+19,
3513
+ "trial_name": null,
3514
+ "trial_params": null
3515
+ }
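The log history above ends with the epoch-10 evaluation block and the final training summary. As a minimal sketch for tabulating those numbers, the snippet below reads the trainer state file added in this commit and prints only the per-epoch evaluation entries; the file name (trainer_state.json) and the enclosing "log_history" key follow the Trainer's usual conventions and are assumed here rather than shown in this diff.

```python
import json

# Minimal sketch: pull the per-epoch evaluation entries out of the trainer
# state file added in this commit. The file name (trainer_state.json) and
# the "log_history" key are assumed from the Trainer's usual conventions.
with open("trainer_state.json", encoding="utf-8") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "eval_bleu" in entry:  # evaluation entries; plain training steps lack this key
        print(f"epoch {entry['epoch']}: "
              f"eval_loss={entry['eval_loss']:.4f} "
              f"bleu={entry['eval_bleu']:.4f} "
              f"gen_len={entry['eval_gen_len']:.2f}")
```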
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1058664b83b63814b24f25a6a472259603e0877b678b12ed0ace9f098ec5e19d
+ size 2991
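The three lines above are a Git LFS pointer; the actual training_args.bin is typically the serialized TrainingArguments object the Trainer saves alongside the checkpoint. A hedged sketch for inspecting it once the real LFS object has been fetched:

```python
import torch

# Sketch only: training_args.bin written by the Hugging Face Trainer is the
# pickled TrainingArguments object, stored in this repo as a Git LFS pointer.
# Recent PyTorch versions may additionally require weights_only=False.
args = torch.load("training_args.bin")
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)
```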
vocab.json ADDED
The diff for this file is too large to render. See raw diff
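For completeness, a hedged usage sketch showing how the files in this commit would typically be loaded for inference with transformers; the repository path and the source language of the input sentence are assumptions, not stated in the diff.

```python
from transformers import MarianMTModel, MarianTokenizer

# Illustrative sketch, not part of this commit: load the checkpoint these
# files belong to and translate one sentence. The path/repo id and the
# translation direction are assumptions.
model_dir = "."  # local clone of this repository, or its Hub id
tokenizer = MarianTokenizer.from_pretrained(model_dir)
model = MarianMTModel.from_pretrained(model_dir)

src = ["Replace this with a sentence in the model's source language."]
batch = tokenizer(src, return_tensors="pt", padding=True)
generated = model.generate(**batch)
print(tokenizer.batch_decode(generated, skip_special_tokens=True))
```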