potamides committed
Commit 62e4c19
1 Parent(s): f664469

add model files

config.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "_name_or_path": "google/byt5-base",
3
+ "architectures": [
4
+ "ByGPT5LMHeadModel"
5
+ ],
6
+ "d_ff": 3968,
7
+ "d_kv": 64,
8
+ "d_model": 1536,
9
+ "decoder_start_token_id": 0,
10
+ "dense_act_fn": "gelu_new",
11
+ "dropout_rate": 0.1,
12
+ "eos_token_id": 1,
13
+ "feed_forward_proj": "gated-gelu",
14
+ "gradient_checkpointing": false,
15
+ "initializer_factor": 1.0,
16
+ "is_decoder": true,
17
+ "is_encoder_decoder": false,
18
+ "is_gated_act": true,
19
+ "layer_norm_epsilon": 1e-06,
20
+ "model_type": "bygpt5",
21
+ "num_decoder_layers": 6,
22
+ "num_heads": 12,
23
+ "num_layers": 6,
24
+ "output_past": true,
25
+ "pad_token_id": 0,
26
+ "relative_attention_max_distance": 128,
27
+ "relative_attention_num_buckets": 32,
28
+ "tie_word_embeddings": false,
29
+ "tokenizer_class": "ByT5Tokenizer",
30
+ "torch_dtype": "float32",
31
+ "transformers_version": "4.20.0",
32
+ "use_cache": true,
33
+ "vocab_size": 384
34
+ }
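
Note: the config describes a decoder-only model ("is_decoder": true, "is_encoder_decoder": false) derived from google/byt5-base, with the custom architecture class ByGPT5LMHeadModel and model_type "bygpt5", which stock transformers Auto classes do not know about. A minimal sketch for inspecting the raw config; loading the weights assumes the project's custom ByGPT5 code is installed (the commented import is an assumption about the package layout, not something defined in this commit):

```python
import json

# Inspect the raw config without needing the custom ByGPT5 classes.
with open("config.json") as f:
    cfg = json.load(f)

print(cfg["architectures"])                                  # ['ByGPT5LMHeadModel']
print(cfg["num_layers"], cfg["d_model"], cfg["num_heads"])   # 6 1536 12

# Loading the weights requires the project's custom model class; this import
# path is an assumption, not part of this repository.
# from bygpt5 import ByGPT5LMHeadModel
# model = ByGPT5LMHeadModel.from_pretrained(".")
```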
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f57f72aead92ed0674bad62f6c55f85db6ec8fe8ff6956286a475817c401b6b
3
+ size 556894071
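
Note: the weight file is tracked with Git LFS, so the committed blob is only a pointer (spec version, SHA-256 object ID, byte size); the ~557 MB binary is fetched separately. A small sketch, assuming the real file has already been downloaded in place of the pointer, that checks it against the recorded hash and size:

```python
import hashlib
import os

EXPECTED_OID = "8f57f72aead92ed0674bad62f6c55f85db6ec8fe8ff6956286a475817c401b6b"
EXPECTED_SIZE = 556894071  # bytes, from the LFS pointer

path = "pytorch_model.bin"
sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)

assert os.path.getsize(path) == EXPECTED_SIZE
assert sha256.hexdigest() == EXPECTED_OID
print("pytorch_model.bin matches the LFS pointer")
```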
special_tokens_map.json ADDED
@@ -0,0 +1,150 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>",
99
+ "<extra_id_96>",
100
+ "<extra_id_97>",
101
+ "<extra_id_98>",
102
+ "<extra_id_99>",
103
+ "<extra_id_100>",
104
+ "<extra_id_101>",
105
+ "<extra_id_102>",
106
+ "<extra_id_103>",
107
+ "<extra_id_104>",
108
+ "<extra_id_105>",
109
+ "<extra_id_106>",
110
+ "<extra_id_107>",
111
+ "<extra_id_108>",
112
+ "<extra_id_109>",
113
+ "<extra_id_110>",
114
+ "<extra_id_111>",
115
+ "<extra_id_112>",
116
+ "<extra_id_113>",
117
+ "<extra_id_114>",
118
+ "<extra_id_115>",
119
+ "<extra_id_116>",
120
+ "<extra_id_117>",
121
+ "<extra_id_118>",
122
+ "<extra_id_119>",
123
+ "<extra_id_120>",
124
+ "<extra_id_121>",
125
+ "<extra_id_122>",
126
+ "<extra_id_123>",
127
+ "<extra_id_124>"
128
+ ],
129
+ "eos_token": {
130
+ "content": "</s>",
131
+ "lstrip": false,
132
+ "normalized": true,
133
+ "rstrip": false,
134
+ "single_word": false
135
+ },
136
+ "pad_token": {
137
+ "content": "<pad>",
138
+ "lstrip": false,
139
+ "normalized": true,
140
+ "rstrip": false,
141
+ "single_word": false
142
+ },
143
+ "unk_token": {
144
+ "content": "<unk>",
145
+ "lstrip": false,
146
+ "normalized": true,
147
+ "rstrip": false,
148
+ "single_word": false
149
+ }
150
+ }
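
Note: the map lists the 125 sentinel tokens <extra_id_0> through <extra_id_124> plus </s>, <pad>, and <unk>, i.e. the inventory inherited from byt5-base. Together with the 256 byte-level tokens this accounts for the 384-entry vocabulary declared in config.json; a quick consistency sketch:

```python
# Sanity-check the special-token inventory against config.json's vocab_size.
extra_ids = [f"<extra_id_{i}>" for i in range(125)]  # <extra_id_0> ... <extra_id_124>
assert len(extra_ids) == 125

n_bytes = 256    # one token per possible byte value
n_special = 3    # <pad>, </s>, <unk>
print(n_special + n_bytes + len(extra_ids))  # 384, matching "vocab_size"
```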
tokenizer_config.json ADDED
@@ -0,0 +1,160 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "additional_special_tokens": [
6
+ "<extra_id_0>",
7
+ "<extra_id_1>",
8
+ "<extra_id_2>",
9
+ "<extra_id_3>",
10
+ "<extra_id_4>",
11
+ "<extra_id_5>",
12
+ "<extra_id_6>",
13
+ "<extra_id_7>",
14
+ "<extra_id_8>",
15
+ "<extra_id_9>",
16
+ "<extra_id_10>",
17
+ "<extra_id_11>",
18
+ "<extra_id_12>",
19
+ "<extra_id_13>",
20
+ "<extra_id_14>",
21
+ "<extra_id_15>",
22
+ "<extra_id_16>",
23
+ "<extra_id_17>",
24
+ "<extra_id_18>",
25
+ "<extra_id_19>",
26
+ "<extra_id_20>",
27
+ "<extra_id_21>",
28
+ "<extra_id_22>",
29
+ "<extra_id_23>",
30
+ "<extra_id_24>",
31
+ "<extra_id_25>",
32
+ "<extra_id_26>",
33
+ "<extra_id_27>",
34
+ "<extra_id_28>",
35
+ "<extra_id_29>",
36
+ "<extra_id_30>",
37
+ "<extra_id_31>",
38
+ "<extra_id_32>",
39
+ "<extra_id_33>",
40
+ "<extra_id_34>",
41
+ "<extra_id_35>",
42
+ "<extra_id_36>",
43
+ "<extra_id_37>",
44
+ "<extra_id_38>",
45
+ "<extra_id_39>",
46
+ "<extra_id_40>",
47
+ "<extra_id_41>",
48
+ "<extra_id_42>",
49
+ "<extra_id_43>",
50
+ "<extra_id_44>",
51
+ "<extra_id_45>",
52
+ "<extra_id_46>",
53
+ "<extra_id_47>",
54
+ "<extra_id_48>",
55
+ "<extra_id_49>",
56
+ "<extra_id_50>",
57
+ "<extra_id_51>",
58
+ "<extra_id_52>",
59
+ "<extra_id_53>",
60
+ "<extra_id_54>",
61
+ "<extra_id_55>",
62
+ "<extra_id_56>",
63
+ "<extra_id_57>",
64
+ "<extra_id_58>",
65
+ "<extra_id_59>",
66
+ "<extra_id_60>",
67
+ "<extra_id_61>",
68
+ "<extra_id_62>",
69
+ "<extra_id_63>",
70
+ "<extra_id_64>",
71
+ "<extra_id_65>",
72
+ "<extra_id_66>",
73
+ "<extra_id_67>",
74
+ "<extra_id_68>",
75
+ "<extra_id_69>",
76
+ "<extra_id_70>",
77
+ "<extra_id_71>",
78
+ "<extra_id_72>",
79
+ "<extra_id_73>",
80
+ "<extra_id_74>",
81
+ "<extra_id_75>",
82
+ "<extra_id_76>",
83
+ "<extra_id_77>",
84
+ "<extra_id_78>",
85
+ "<extra_id_79>",
86
+ "<extra_id_80>",
87
+ "<extra_id_81>",
88
+ "<extra_id_82>",
89
+ "<extra_id_83>",
90
+ "<extra_id_84>",
91
+ "<extra_id_85>",
92
+ "<extra_id_86>",
93
+ "<extra_id_87>",
94
+ "<extra_id_88>",
95
+ "<extra_id_89>",
96
+ "<extra_id_90>",
97
+ "<extra_id_91>",
98
+ "<extra_id_92>",
99
+ "<extra_id_93>",
100
+ "<extra_id_94>",
101
+ "<extra_id_95>",
102
+ "<extra_id_96>",
103
+ "<extra_id_97>",
104
+ "<extra_id_98>",
105
+ "<extra_id_99>",
106
+ "<extra_id_100>",
107
+ "<extra_id_101>",
108
+ "<extra_id_102>",
109
+ "<extra_id_103>",
110
+ "<extra_id_104>",
111
+ "<extra_id_105>",
112
+ "<extra_id_106>",
113
+ "<extra_id_107>",
114
+ "<extra_id_108>",
115
+ "<extra_id_109>",
116
+ "<extra_id_110>",
117
+ "<extra_id_111>",
118
+ "<extra_id_112>",
119
+ "<extra_id_113>",
120
+ "<extra_id_114>",
121
+ "<extra_id_115>",
122
+ "<extra_id_116>",
123
+ "<extra_id_117>",
124
+ "<extra_id_118>",
125
+ "<extra_id_119>",
126
+ "<extra_id_120>",
127
+ "<extra_id_121>",
128
+ "<extra_id_122>",
129
+ "<extra_id_123>",
130
+ "<extra_id_124>"
131
+ ],
132
+ "eos_token": {
133
+ "__type": "AddedToken",
134
+ "content": "</s>",
135
+ "lstrip": false,
136
+ "normalized": true,
137
+ "rstrip": false,
138
+ "single_word": false
139
+ },
140
+ "extra_ids": 125,
141
+ "name_or_path": "google/byt5-base",
142
+ "pad_token": {
143
+ "__type": "AddedToken",
144
+ "content": "<pad>",
145
+ "lstrip": false,
146
+ "normalized": true,
147
+ "rstrip": false,
148
+ "single_word": false
149
+ },
150
+ "special_tokens_map_file": "/home/jbelouadi/.cache/huggingface/transformers/f22e687c418a9ed3e651ca340d6a5880bbb312bc24bbe893f5cd47288891b89d.063895353d5ef9e19a25220cb616c43abc5e84a2f11b1ffb71c29e097572a109",
151
+ "tokenizer_class": "ByGPT5Tokenizer",
152
+ "unk_token": {
153
+ "__type": "AddedToken",
154
+ "content": "<unk>",
155
+ "lstrip": false,
156
+ "normalized": true,
157
+ "rstrip": false,
158
+ "single_word": false
159
+ }
160
+ }
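
Note: this file names the project's ByGPT5Tokenizer, while config.json points at the stock ByT5Tokenizer; both follow ByT5's byte-level scheme, where a string is encoded as its UTF-8 bytes offset by the special tokens. A sketch using the ByT5Tokenizer shipped with transformers as a stand-in (assuming the custom class keeps the same scheme):

```python
from transformers import ByT5Tokenizer

# Stand-in for the repository's ByGPT5Tokenizer; extra_ids matches
# "extra_ids": 125 above.
tok = ByT5Tokenizer(extra_ids=125)

ids = tok("hi", add_special_tokens=False)["input_ids"]
print(ids)                                                   # [107, 108]: byte value + 3
print(tok.pad_token_id, tok.eos_token_id, tok.unk_token_id)  # 0 1 2
```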
trainer_state.json ADDED
@@ -0,0 +1,1391 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9999762505640492,
5
+ "global_step": 50000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.0,
12
+ "learning_rate": 1.2e-06,
13
+ "loss": 12.4223,
14
+ "step": 1
15
+ },
16
+ {
17
+ "epoch": 0.0,
18
+ "learning_rate": 0.0003,
19
+ "loss": 2.6675,
20
+ "step": 250
21
+ },
22
+ {
23
+ "epoch": 0.01,
24
+ "learning_rate": 0.0006,
25
+ "loss": 1.2324,
26
+ "step": 500
27
+ },
28
+ {
29
+ "epoch": 0.01,
30
+ "learning_rate": 0.0005999622383021625,
31
+ "loss": 1.1509,
32
+ "step": 750
33
+ },
34
+ {
35
+ "epoch": 0.02,
36
+ "learning_rate": 0.0005998489627149555,
37
+ "loss": 1.1126,
38
+ "step": 1000
39
+ },
40
+ {
41
+ "epoch": 0.02,
42
+ "learning_rate": 0.0005996602017549024,
43
+ "loss": 1.0883,
44
+ "step": 1250
45
+ },
46
+ {
47
+ "epoch": 0.03,
48
+ "learning_rate": 0.0005993960029415653,
49
+ "loss": 1.0704,
50
+ "step": 1500
51
+ },
52
+ {
53
+ "epoch": 0.03,
54
+ "learning_rate": 0.0005990564327855827,
55
+ "loss": 1.057,
56
+ "step": 1750
57
+ },
58
+ {
59
+ "epoch": 0.04,
60
+ "learning_rate": 0.0005986415767719254,
61
+ "loss": 1.0441,
62
+ "step": 2000
63
+ },
64
+ {
65
+ "epoch": 0.04,
66
+ "learning_rate": 0.0005981515393383762,
67
+ "loss": 1.035,
68
+ "step": 2250
69
+ },
70
+ {
71
+ "epoch": 0.05,
72
+ "learning_rate": 0.0005975864438492385,
73
+ "loss": 1.0263,
74
+ "step": 2500
75
+ },
76
+ {
77
+ "epoch": 0.05,
78
+ "eval_loss": 0.9570981860160828,
79
+ "eval_runtime": 730.5149,
80
+ "eval_samples_per_second": 140.175,
81
+ "eval_steps_per_second": 4.38,
82
+ "step": 2500
83
+ },
84
+ {
85
+ "epoch": 0.05,
86
+ "learning_rate": 0.0005969464325642798,
87
+ "loss": 1.0184,
88
+ "step": 2750
89
+ },
90
+ {
91
+ "epoch": 0.06,
92
+ "learning_rate": 0.0005962316666029183,
93
+ "loss": 1.0113,
94
+ "step": 3000
95
+ },
96
+ {
97
+ "epoch": 0.06,
98
+ "learning_rate": 0.0005954423259036624,
99
+ "loss": 1.0046,
100
+ "step": 3250
101
+ },
102
+ {
103
+ "epoch": 0.07,
104
+ "learning_rate": 0.0005945786091788119,
105
+ "loss": 0.9991,
106
+ "step": 3500
107
+ },
108
+ {
109
+ "epoch": 0.07,
110
+ "learning_rate": 0.0005936407338644336,
111
+ "loss": 0.9945,
112
+ "step": 3750
113
+ },
114
+ {
115
+ "epoch": 0.08,
116
+ "learning_rate": 0.0005926289360656221,
117
+ "loss": 0.9888,
118
+ "step": 4000
119
+ },
120
+ {
121
+ "epoch": 0.08,
122
+ "learning_rate": 0.0005915434704970625,
123
+ "loss": 0.9842,
124
+ "step": 4250
125
+ },
126
+ {
127
+ "epoch": 0.09,
128
+ "learning_rate": 0.0005903846104189068,
129
+ "loss": 0.9801,
130
+ "step": 4500
131
+ },
132
+ {
133
+ "epoch": 0.09,
134
+ "learning_rate": 0.0005891526475679825,
135
+ "loss": 0.9767,
136
+ "step": 4750
137
+ },
138
+ {
139
+ "epoch": 0.1,
140
+ "learning_rate": 0.0005878478920843492,
141
+ "loss": 0.9721,
142
+ "step": 5000
143
+ },
144
+ {
145
+ "epoch": 0.1,
146
+ "eval_loss": 0.9183236956596375,
147
+ "eval_runtime": 729.0743,
148
+ "eval_samples_per_second": 140.452,
149
+ "eval_steps_per_second": 4.389,
150
+ "step": 5000
151
+ },
152
+ {
153
+ "epoch": 0.1,
154
+ "learning_rate": 0.0005864706724332221,
155
+ "loss": 0.9676,
156
+ "step": 5250
157
+ },
158
+ {
159
+ "epoch": 0.11,
160
+ "learning_rate": 0.0005850213353222835,
161
+ "loss": 0.9645,
162
+ "step": 5500
163
+ },
164
+ {
165
+ "epoch": 0.11,
166
+ "learning_rate": 0.0005835002456144005,
167
+ "loss": 0.9605,
168
+ "step": 5750
169
+ },
170
+ {
171
+ "epoch": 0.12,
172
+ "learning_rate": 0.0005819077862357724,
173
+ "loss": 0.9581,
174
+ "step": 6000
175
+ },
176
+ {
177
+ "epoch": 0.12,
178
+ "learning_rate": 0.000580244358079532,
179
+ "loss": 0.9546,
180
+ "step": 6250
181
+ },
182
+ {
183
+ "epoch": 0.13,
184
+ "learning_rate": 0.0005785103799048218,
185
+ "loss": 0.9522,
186
+ "step": 6500
187
+ },
188
+ {
189
+ "epoch": 0.13,
190
+ "learning_rate": 0.0005767062882313744,
191
+ "loss": 0.9495,
192
+ "step": 6750
193
+ },
194
+ {
195
+ "epoch": 0.14,
196
+ "learning_rate": 0.0005748325372296208,
197
+ "loss": 0.9469,
198
+ "step": 7000
199
+ },
200
+ {
201
+ "epoch": 0.14,
202
+ "learning_rate": 0.0005728895986063555,
203
+ "loss": 0.9439,
204
+ "step": 7250
205
+ },
206
+ {
207
+ "epoch": 0.15,
208
+ "learning_rate": 0.0005708779614859863,
209
+ "loss": 0.9416,
210
+ "step": 7500
211
+ },
212
+ {
213
+ "epoch": 0.15,
214
+ "eval_loss": 0.8962864875793457,
215
+ "eval_runtime": 729.2354,
216
+ "eval_samples_per_second": 140.421,
217
+ "eval_steps_per_second": 4.388,
218
+ "step": 7500
219
+ },
220
+ {
221
+ "epoch": 0.15,
222
+ "learning_rate": 0.0005687981322874007,
223
+ "loss": 0.9409,
224
+ "step": 7750
225
+ },
226
+ {
227
+ "epoch": 0.16,
228
+ "learning_rate": 0.000566650634596477,
229
+ "loss": 0.9379,
230
+ "step": 8000
231
+ },
232
+ {
233
+ "epoch": 0.16,
234
+ "learning_rate": 0.0005644360090342746,
235
+ "loss": 0.936,
236
+ "step": 8250
237
+ },
238
+ {
239
+ "epoch": 0.17,
240
+ "learning_rate": 0.0005621548131209354,
241
+ "loss": 0.9333,
242
+ "step": 8500
243
+ },
244
+ {
245
+ "epoch": 0.17,
246
+ "learning_rate": 0.0005598076211353316,
247
+ "loss": 0.9319,
248
+ "step": 8750
249
+ },
250
+ {
251
+ "epoch": 0.18,
252
+ "learning_rate": 0.000557395023970493,
253
+ "loss": 0.93,
254
+ "step": 9000
255
+ },
256
+ {
257
+ "epoch": 0.18,
258
+ "learning_rate": 0.0005549176289848543,
259
+ "loss": 0.9278,
260
+ "step": 9250
261
+ },
262
+ {
263
+ "epoch": 0.19,
264
+ "learning_rate": 0.0005523760598493544,
265
+ "loss": 0.9263,
266
+ "step": 9500
267
+ },
268
+ {
269
+ "epoch": 0.19,
270
+ "learning_rate": 0.0005497709563904314,
271
+ "loss": 0.9248,
272
+ "step": 9750
273
+ },
274
+ {
275
+ "epoch": 0.2,
276
+ "learning_rate": 0.0005471029744289498,
277
+ "loss": 0.9239,
278
+ "step": 10000
279
+ },
280
+ {
281
+ "epoch": 0.2,
282
+ "eval_loss": 0.8806753754615784,
283
+ "eval_runtime": 883.4007,
284
+ "eval_samples_per_second": 115.916,
285
+ "eval_steps_per_second": 3.622,
286
+ "step": 10000
287
+ },
288
+ {
289
+ "epoch": 0.2,
290
+ "learning_rate": 0.0005443727856151006,
291
+ "loss": 0.9223,
292
+ "step": 10250
293
+ },
294
+ {
295
+ "epoch": 0.21,
296
+ "learning_rate": 0.0005415810772593175,
297
+ "loss": 0.921,
298
+ "step": 10500
299
+ },
300
+ {
301
+ "epoch": 0.21,
302
+ "learning_rate": 0.0005387285521592496,
303
+ "loss": 0.9192,
304
+ "step": 10750
305
+ },
306
+ {
307
+ "epoch": 0.22,
308
+ "learning_rate": 0.0005358159284228363,
309
+ "loss": 0.9175,
310
+ "step": 11000
311
+ },
312
+ {
313
+ "epoch": 0.22,
314
+ "learning_rate": 0.000532843939287527,
315
+ "loss": 0.9157,
316
+ "step": 11250
317
+ },
318
+ {
319
+ "epoch": 0.23,
320
+ "learning_rate": 0.0005298133329356933,
321
+ "loss": 0.9147,
322
+ "step": 11500
323
+ },
324
+ {
325
+ "epoch": 0.23,
326
+ "learning_rate": 0.0005267248723062775,
327
+ "loss": 0.9133,
328
+ "step": 11750
329
+ },
330
+ {
331
+ "epoch": 0.24,
332
+ "learning_rate": 0.0005235793349027264,
333
+ "loss": 0.913,
334
+ "step": 12000
335
+ },
336
+ {
337
+ "epoch": 0.24,
338
+ "learning_rate": 0.0005203775125972599,
339
+ "loss": 0.9112,
340
+ "step": 12250
341
+ },
342
+ {
343
+ "epoch": 0.25,
344
+ "learning_rate": 0.000517120211431521,
345
+ "loss": 0.9111,
346
+ "step": 12500
347
+ },
348
+ {
349
+ "epoch": 0.25,
350
+ "eval_loss": 0.8689142465591431,
351
+ "eval_runtime": 886.9621,
352
+ "eval_samples_per_second": 115.45,
353
+ "eval_steps_per_second": 3.608,
354
+ "step": 12500
355
+ },
356
+ {
357
+ "epoch": 0.25,
358
+ "learning_rate": 0.0005138082514136589,
359
+ "loss": 0.9088,
360
+ "step": 12750
361
+ },
362
+ {
363
+ "epoch": 0.26,
364
+ "learning_rate": 0.0005104424663118964,
365
+ "loss": 0.9077,
366
+ "step": 13000
367
+ },
368
+ {
369
+ "epoch": 0.26,
370
+ "learning_rate": 0.0005070237034446336,
371
+ "loss": 0.9066,
372
+ "step": 13250
373
+ },
374
+ {
375
+ "epoch": 0.27,
376
+ "learning_rate": 0.0005035528234671396,
377
+ "loss": 0.9049,
378
+ "step": 13500
379
+ },
380
+ {
381
+ "epoch": 0.27,
382
+ "learning_rate": 0.0005000307001548875,
383
+ "loss": 0.9038,
384
+ "step": 13750
385
+ },
386
+ {
387
+ "epoch": 0.28,
388
+ "learning_rate": 0.0004964582201835855,
389
+ "loss": 0.9034,
390
+ "step": 14000
391
+ },
392
+ {
393
+ "epoch": 0.28,
394
+ "learning_rate": 0.0004928362829059618,
395
+ "loss": 0.9023,
396
+ "step": 14250
397
+ },
398
+ {
399
+ "epoch": 0.29,
400
+ "learning_rate": 0.0004891658001253567,
401
+ "loss": 0.9012,
402
+ "step": 14500
403
+ },
404
+ {
405
+ "epoch": 0.29,
406
+ "learning_rate": 0.00048544769586618153,
407
+ "loss": 0.9001,
408
+ "step": 14750
409
+ },
410
+ {
411
+ "epoch": 0.3,
412
+ "learning_rate": 0.00048168290614129995,
413
+ "loss": 0.8996,
414
+ "step": 15000
415
+ },
416
+ {
417
+ "epoch": 0.3,
418
+ "eval_loss": 0.8599066138267517,
419
+ "eval_runtime": 890.9527,
420
+ "eval_samples_per_second": 114.933,
421
+ "eval_steps_per_second": 3.592,
422
+ "step": 15000
423
+ },
424
+ {
425
+ "epoch": 0.3,
426
+ "learning_rate": 0.00047787237871639213,
427
+ "loss": 0.898,
428
+ "step": 15250
429
+ },
430
+ {
431
+ "epoch": 0.31,
432
+ "learning_rate": 0.0004740170728713594,
433
+ "loss": 0.8978,
434
+ "step": 15500
435
+ },
436
+ {
437
+ "epoch": 0.31,
438
+ "learning_rate": 0.0004701179591588311,
439
+ "loss": 0.8967,
440
+ "step": 15750
441
+ },
442
+ {
443
+ "epoch": 0.32,
444
+ "learning_rate": 0.00046617601915983307,
445
+ "loss": 0.8956,
446
+ "step": 16000
447
+ },
448
+ {
449
+ "epoch": 0.32,
450
+ "learning_rate": 0.00046219224523667927,
451
+ "loss": 0.8937,
452
+ "step": 16250
453
+ },
454
+ {
455
+ "epoch": 0.33,
456
+ "learning_rate": 0.00045816764028315066,
457
+ "loss": 0.894,
458
+ "step": 16500
459
+ },
460
+ {
461
+ "epoch": 0.33,
462
+ "learning_rate": 0.0004541032174720219,
463
+ "loss": 0.8929,
464
+ "step": 16750
465
+ },
466
+ {
467
+ "epoch": 0.34,
468
+ "learning_rate": 0.00045,
469
+ "loss": 0.8927,
470
+ "step": 17000
471
+ },
472
+ {
473
+ "epoch": 0.34,
474
+ "learning_rate": 0.00044585902083014057,
475
+ "loss": 0.8905,
476
+ "step": 17250
477
+ },
478
+ {
479
+ "epoch": 0.35,
480
+ "learning_rate": 0.0004416813224318048,
481
+ "loss": 0.8909,
482
+ "step": 17500
483
+ },
484
+ {
485
+ "epoch": 0.35,
486
+ "eval_loss": 0.8517666459083557,
487
+ "eval_runtime": 888.2193,
488
+ "eval_samples_per_second": 115.287,
489
+ "eval_steps_per_second": 3.603,
490
+ "step": 17500
491
+ },
492
+ {
493
+ "epoch": 0.35,
494
+ "learning_rate": 0.00043746795651822306,
495
+ "loss": 0.8893,
496
+ "step": 17750
497
+ },
498
+ {
499
+ "epoch": 0.36,
500
+ "learning_rate": 0.0004332199837817322,
501
+ "loss": 0.8891,
502
+ "step": 18000
503
+ },
504
+ {
505
+ "epoch": 0.36,
506
+ "learning_rate": 0.0004289384736267515,
507
+ "loss": 0.8889,
508
+ "step": 18250
509
+ },
510
+ {
511
+ "epoch": 0.37,
512
+ "learning_rate": 0.00042462450390056593,
513
+ "loss": 0.8883,
514
+ "step": 18500
515
+ },
516
+ {
517
+ "epoch": 0.37,
518
+ "learning_rate": 0.0004202791606219841,
519
+ "loss": 0.8873,
520
+ "step": 18750
521
+ },
522
+ {
523
+ "epoch": 0.38,
524
+ "learning_rate": 0.0004159035377079385,
525
+ "loss": 0.8868,
526
+ "step": 19000
527
+ },
528
+ {
529
+ "epoch": 0.38,
530
+ "learning_rate": 0.0004114987366980982,
531
+ "loss": 0.8857,
532
+ "step": 19250
533
+ },
534
+ {
535
+ "epoch": 0.39,
536
+ "learning_rate": 0.0004070658664775615,
537
+ "loss": 0.8838,
538
+ "step": 19500
539
+ },
540
+ {
541
+ "epoch": 0.39,
542
+ "learning_rate": 0.00040260604299770063,
543
+ "loss": 0.8838,
544
+ "step": 19750
545
+ },
546
+ {
547
+ "epoch": 0.4,
548
+ "learning_rate": 0.0003981203889952265,
549
+ "loss": 0.8837,
550
+ "step": 20000
551
+ },
552
+ {
553
+ "epoch": 0.4,
554
+ "eval_loss": 0.8464268445968628,
555
+ "eval_runtime": 890.045,
556
+ "eval_samples_per_second": 115.05,
557
+ "eval_steps_per_second": 3.595,
558
+ "step": 20000
559
+ },
560
+ {
561
+ "epoch": 0.4,
562
+ "learning_rate": 0.0003936100337095461,
563
+ "loss": 0.8831,
564
+ "step": 20250
565
+ },
566
+ {
567
+ "epoch": 0.41,
568
+ "learning_rate": 0.0003890761125984825,
569
+ "loss": 0.8821,
570
+ "step": 20500
571
+ },
572
+ {
573
+ "epoch": 0.41,
574
+ "learning_rate": 0.0003845197670524289,
575
+ "loss": 0.8813,
576
+ "step": 20750
577
+ },
578
+ {
579
+ "epoch": 0.42,
580
+ "learning_rate": 0.0003799421441070104,
581
+ "loss": 0.8817,
582
+ "step": 21000
583
+ },
584
+ {
585
+ "epoch": 0.42,
586
+ "learning_rate": 0.0003753443961543237,
587
+ "loss": 0.88,
588
+ "step": 21250
589
+ },
590
+ {
591
+ "epoch": 0.43,
592
+ "learning_rate": 0.0003707276806528282,
593
+ "loss": 0.8802,
594
+ "step": 21500
595
+ },
596
+ {
597
+ "epoch": 0.43,
598
+ "learning_rate": 0.0003660931598359622,
599
+ "loss": 0.8792,
600
+ "step": 21750
601
+ },
602
+ {
603
+ "epoch": 0.44,
604
+ "learning_rate": 0.0003614420004195572,
605
+ "loss": 0.8785,
606
+ "step": 22000
607
+ },
608
+ {
609
+ "epoch": 0.44,
610
+ "learning_rate": 0.000356775373308123,
611
+ "loss": 0.878,
612
+ "step": 22250
613
+ },
614
+ {
615
+ "epoch": 0.45,
616
+ "learning_rate": 0.0003520944533000791,
617
+ "loss": 0.8775,
618
+ "step": 22500
619
+ },
620
+ {
621
+ "epoch": 0.45,
622
+ "eval_loss": 0.8403845429420471,
623
+ "eval_runtime": 891.8066,
624
+ "eval_samples_per_second": 114.823,
625
+ "eval_steps_per_second": 3.588,
626
+ "step": 22500
627
+ },
628
+ {
629
+ "epoch": 0.45,
630
+ "learning_rate": 0.00034740041879200497,
631
+ "loss": 0.8772,
632
+ "step": 22750
633
+ },
634
+ {
635
+ "epoch": 0.46,
636
+ "learning_rate": 0.00034269445148198553,
637
+ "loss": 0.876,
638
+ "step": 23000
639
+ },
640
+ {
641
+ "epoch": 0.46,
642
+ "learning_rate": 0.00033797773607212474,
643
+ "loss": 0.8746,
644
+ "step": 23250
645
+ },
646
+ {
647
+ "epoch": 0.47,
648
+ "learning_rate": 0.0003332514599703033,
649
+ "loss": 0.8755,
650
+ "step": 23500
651
+ },
652
+ {
653
+ "epoch": 0.47,
654
+ "learning_rate": 0.0003285168129912547,
655
+ "loss": 0.875,
656
+ "step": 23750
657
+ },
658
+ {
659
+ "epoch": 0.48,
660
+ "learning_rate": 0.0003237749870570365,
661
+ "loss": 0.8745,
662
+ "step": 24000
663
+ },
664
+ {
665
+ "epoch": 0.48,
666
+ "learning_rate": 0.0003190271758969693,
667
+ "loss": 0.8733,
668
+ "step": 24250
669
+ },
670
+ {
671
+ "epoch": 0.49,
672
+ "learning_rate": 0.00031427457474712274,
673
+ "loss": 0.8731,
674
+ "step": 24500
675
+ },
676
+ {
677
+ "epoch": 0.49,
678
+ "learning_rate": 0.0003095183800494203,
679
+ "loss": 0.8732,
680
+ "step": 24750
681
+ },
682
+ {
683
+ "epoch": 0.5,
684
+ "learning_rate": 0.00030475978915044235,
685
+ "loss": 0.8716,
686
+ "step": 25000
687
+ },
688
+ {
689
+ "epoch": 0.5,
690
+ "eval_loss": 0.8353695869445801,
691
+ "eval_runtime": 897.249,
692
+ "eval_samples_per_second": 114.127,
693
+ "eval_steps_per_second": 3.566,
694
+ "step": 25000
695
+ },
696
+ {
697
+ "epoch": 0.5,
698
+ "learning_rate": 0.0003,
699
+ "loss": 0.871,
700
+ "step": 25250
701
+ },
702
+ {
703
+ "epoch": 0.51,
704
+ "learning_rate": 0.0002952402108495576,
705
+ "loss": 0.8709,
706
+ "step": 25500
707
+ },
708
+ {
709
+ "epoch": 0.51,
710
+ "learning_rate": 0.00029048161995057974,
711
+ "loss": 0.87,
712
+ "step": 25750
713
+ },
714
+ {
715
+ "epoch": 0.52,
716
+ "learning_rate": 0.0002857254252528773,
717
+ "loss": 0.8699,
718
+ "step": 26000
719
+ },
720
+ {
721
+ "epoch": 0.52,
722
+ "learning_rate": 0.00028097282410303066,
723
+ "loss": 0.869,
724
+ "step": 26250
725
+ },
726
+ {
727
+ "epoch": 0.53,
728
+ "learning_rate": 0.0002762250129429634,
729
+ "loss": 0.8684,
730
+ "step": 26500
731
+ },
732
+ {
733
+ "epoch": 0.53,
734
+ "learning_rate": 0.00027148318700874523,
735
+ "loss": 0.8687,
736
+ "step": 26750
737
+ },
738
+ {
739
+ "epoch": 0.54,
740
+ "learning_rate": 0.0002667485400296967,
741
+ "loss": 0.8686,
742
+ "step": 27000
743
+ },
744
+ {
745
+ "epoch": 0.54,
746
+ "learning_rate": 0.00026202226392787515,
747
+ "loss": 0.8681,
748
+ "step": 27250
749
+ },
750
+ {
751
+ "epoch": 0.55,
752
+ "learning_rate": 0.0002573055485180145,
753
+ "loss": 0.8669,
754
+ "step": 27500
755
+ },
756
+ {
757
+ "epoch": 0.55,
758
+ "eval_loss": 0.8309236764907837,
759
+ "eval_runtime": 889.4471,
760
+ "eval_samples_per_second": 115.128,
761
+ "eval_steps_per_second": 3.598,
762
+ "step": 27500
763
+ },
764
+ {
765
+ "epoch": 0.55,
766
+ "learning_rate": 0.000252599581207995,
767
+ "loss": 0.8677,
768
+ "step": 27750
769
+ },
770
+ {
771
+ "epoch": 0.56,
772
+ "learning_rate": 0.0002479055466999209,
773
+ "loss": 0.8658,
774
+ "step": 28000
775
+ },
776
+ {
777
+ "epoch": 0.56,
778
+ "learning_rate": 0.00024322462669187702,
779
+ "loss": 0.866,
780
+ "step": 28250
781
+ },
782
+ {
783
+ "epoch": 0.57,
784
+ "learning_rate": 0.0002385579995804428,
785
+ "loss": 0.8655,
786
+ "step": 28500
787
+ },
788
+ {
789
+ "epoch": 0.57,
790
+ "learning_rate": 0.00023390684016403777,
791
+ "loss": 0.8652,
792
+ "step": 28750
793
+ },
794
+ {
795
+ "epoch": 0.58,
796
+ "learning_rate": 0.00022927231934717176,
797
+ "loss": 0.865,
798
+ "step": 29000
799
+ },
800
+ {
801
+ "epoch": 0.58,
802
+ "learning_rate": 0.00022465560384567624,
803
+ "loss": 0.8641,
804
+ "step": 29250
805
+ },
806
+ {
807
+ "epoch": 0.59,
808
+ "learning_rate": 0.00022005785589298952,
809
+ "loss": 0.8639,
810
+ "step": 29500
811
+ },
812
+ {
813
+ "epoch": 0.59,
814
+ "learning_rate": 0.00021548023294757105,
815
+ "loss": 0.8628,
816
+ "step": 29750
817
+ },
818
+ {
819
+ "epoch": 0.6,
820
+ "learning_rate": 0.00021092388740151762,
821
+ "loss": 0.8633,
822
+ "step": 30000
823
+ },
824
+ {
825
+ "epoch": 0.6,
826
+ "eval_loss": 0.8268718719482422,
827
+ "eval_runtime": 886.7236,
828
+ "eval_samples_per_second": 115.481,
829
+ "eval_steps_per_second": 3.609,
830
+ "step": 30000
831
+ },
832
+ {
833
+ "epoch": 0.6,
834
+ "learning_rate": 0.00020638996629045387,
835
+ "loss": 0.8624,
836
+ "step": 30250
837
+ },
838
+ {
839
+ "epoch": 0.61,
840
+ "learning_rate": 0.0002018796110047735,
841
+ "loss": 0.8614,
842
+ "step": 30500
843
+ },
844
+ {
845
+ "epoch": 0.61,
846
+ "learning_rate": 0.00019739395700229937,
847
+ "loss": 0.8612,
848
+ "step": 30750
849
+ },
850
+ {
851
+ "epoch": 0.62,
852
+ "learning_rate": 0.00019293413352243846,
853
+ "loss": 0.8608,
854
+ "step": 31000
855
+ },
856
+ {
857
+ "epoch": 0.62,
858
+ "learning_rate": 0.00018850126330190176,
859
+ "loss": 0.8606,
860
+ "step": 31250
861
+ },
862
+ {
863
+ "epoch": 0.63,
864
+ "learning_rate": 0.00018409646229206137,
865
+ "loss": 0.8601,
866
+ "step": 31500
867
+ },
868
+ {
869
+ "epoch": 0.63,
870
+ "learning_rate": 0.00017972083937801593,
871
+ "loss": 0.8608,
872
+ "step": 31750
873
+ },
874
+ {
875
+ "epoch": 0.64,
876
+ "learning_rate": 0.0001753754960994341,
877
+ "loss": 0.8597,
878
+ "step": 32000
879
+ },
880
+ {
881
+ "epoch": 0.64,
882
+ "learning_rate": 0.0001710615263732485,
883
+ "loss": 0.8593,
884
+ "step": 32250
885
+ },
886
+ {
887
+ "epoch": 0.65,
888
+ "learning_rate": 0.00016678001621826772,
889
+ "loss": 0.8581,
890
+ "step": 32500
891
+ },
892
+ {
893
+ "epoch": 0.65,
894
+ "eval_loss": 0.8240156769752502,
895
+ "eval_runtime": 887.3331,
896
+ "eval_samples_per_second": 115.402,
897
+ "eval_steps_per_second": 3.606,
898
+ "step": 32500
899
+ },
900
+ {
901
+ "epoch": 0.65,
902
+ "learning_rate": 0.00016253204348177686,
903
+ "loss": 0.8594,
904
+ "step": 32750
905
+ },
906
+ {
907
+ "epoch": 0.66,
908
+ "learning_rate": 0.00015831867756819522,
909
+ "loss": 0.8591,
910
+ "step": 33000
911
+ },
912
+ {
913
+ "epoch": 0.66,
914
+ "learning_rate": 0.00015414097916985944,
915
+ "loss": 0.8581,
916
+ "step": 33250
917
+ },
918
+ {
919
+ "epoch": 0.67,
920
+ "learning_rate": 0.00015000000000000004,
921
+ "loss": 0.8573,
922
+ "step": 33500
923
+ },
924
+ {
925
+ "epoch": 0.67,
926
+ "learning_rate": 0.00014589678252797817,
927
+ "loss": 0.8576,
928
+ "step": 33750
929
+ },
930
+ {
931
+ "epoch": 0.68,
932
+ "learning_rate": 0.00014183235971684924,
933
+ "loss": 0.8571,
934
+ "step": 34000
935
+ },
936
+ {
937
+ "epoch": 0.68,
938
+ "learning_rate": 0.00013780775476332082,
939
+ "loss": 0.8572,
940
+ "step": 34250
941
+ },
942
+ {
943
+ "epoch": 0.69,
944
+ "learning_rate": 0.0001338239808401669,
945
+ "loss": 0.8566,
946
+ "step": 34500
947
+ },
948
+ {
949
+ "epoch": 0.69,
950
+ "learning_rate": 0.0001298820408411688,
951
+ "loss": 0.8558,
952
+ "step": 34750
953
+ },
954
+ {
955
+ "epoch": 0.7,
956
+ "learning_rate": 0.00012598292712864058,
957
+ "loss": 0.8565,
958
+ "step": 35000
959
+ },
960
+ {
961
+ "epoch": 0.7,
962
+ "eval_loss": 0.820695698261261,
963
+ "eval_runtime": 867.1933,
964
+ "eval_samples_per_second": 118.082,
965
+ "eval_steps_per_second": 3.69,
966
+ "step": 35000
967
+ },
968
+ {
969
+ "epoch": 0.7,
970
+ "learning_rate": 0.0001221276212836079,
971
+ "loss": 0.8564,
972
+ "step": 35250
973
+ },
974
+ {
975
+ "epoch": 0.71,
976
+ "learning_rate": 0.00011831709385870004,
977
+ "loss": 0.8553,
978
+ "step": 35500
979
+ },
980
+ {
981
+ "epoch": 0.71,
982
+ "learning_rate": 0.0001145523041338184,
983
+ "loss": 0.8548,
984
+ "step": 35750
985
+ },
986
+ {
987
+ "epoch": 0.72,
988
+ "learning_rate": 0.00011083419987464334,
989
+ "loss": 0.8551,
990
+ "step": 36000
991
+ },
992
+ {
993
+ "epoch": 0.72,
994
+ "learning_rate": 0.00010716371709403818,
995
+ "loss": 0.8543,
996
+ "step": 36250
997
+ },
998
+ {
999
+ "epoch": 0.73,
1000
+ "learning_rate": 0.00010354177981641449,
1001
+ "loss": 0.8547,
1002
+ "step": 36500
1003
+ },
1004
+ {
1005
+ "epoch": 0.73,
1006
+ "learning_rate": 9.996929984511254e-05,
1007
+ "loss": 0.8538,
1008
+ "step": 36750
1009
+ },
1010
+ {
1011
+ "epoch": 0.74,
1012
+ "learning_rate": 9.644717653286037e-05,
1013
+ "loss": 0.8539,
1014
+ "step": 37000
1015
+ },
1016
+ {
1017
+ "epoch": 0.74,
1018
+ "learning_rate": 9.297629655536644e-05,
1019
+ "loss": 0.8547,
1020
+ "step": 37250
1021
+ },
1022
+ {
1023
+ "epoch": 0.75,
1024
+ "learning_rate": 8.955753368810358e-05,
1025
+ "loss": 0.8531,
1026
+ "step": 37500
1027
+ },
1028
+ {
1029
+ "epoch": 0.75,
1030
+ "eval_loss": 0.8180103302001953,
1031
+ "eval_runtime": 728.2589,
1032
+ "eval_samples_per_second": 140.609,
1033
+ "eval_steps_per_second": 4.394,
1034
+ "step": 37500
1035
+ },
1036
+ {
1037
+ "epoch": 0.75,
1038
+ "learning_rate": 8.619174858634122e-05,
1039
+ "loss": 0.8533,
1040
+ "step": 37750
1041
+ },
1042
+ {
1043
+ "epoch": 0.76,
1044
+ "learning_rate": 8.287978856847894e-05,
1045
+ "loss": 0.8533,
1046
+ "step": 38000
1047
+ },
1048
+ {
1049
+ "epoch": 0.76,
1050
+ "learning_rate": 7.962248740274003e-05,
1051
+ "loss": 0.8531,
1052
+ "step": 38250
1053
+ },
1054
+ {
1055
+ "epoch": 0.77,
1056
+ "learning_rate": 7.642066509727359e-05,
1057
+ "loss": 0.8523,
1058
+ "step": 38500
1059
+ },
1060
+ {
1061
+ "epoch": 0.77,
1062
+ "learning_rate": 7.327512769372254e-05,
1063
+ "loss": 0.8523,
1064
+ "step": 38750
1065
+ },
1066
+ {
1067
+ "epoch": 0.78,
1068
+ "learning_rate": 7.018666706430662e-05,
1069
+ "loss": 0.8524,
1070
+ "step": 39000
1071
+ },
1072
+ {
1073
+ "epoch": 0.78,
1074
+ "learning_rate": 6.715606071247291e-05,
1075
+ "loss": 0.8519,
1076
+ "step": 39250
1077
+ },
1078
+ {
1079
+ "epoch": 0.79,
1080
+ "learning_rate": 6.418407157716381e-05,
1081
+ "loss": 0.851,
1082
+ "step": 39500
1083
+ },
1084
+ {
1085
+ "epoch": 0.79,
1086
+ "learning_rate": 6.127144784075033e-05,
1087
+ "loss": 0.8512,
1088
+ "step": 39750
1089
+ },
1090
+ {
1091
+ "epoch": 0.8,
1092
+ "learning_rate": 5.841892274068241e-05,
1093
+ "loss": 0.8508,
1094
+ "step": 40000
1095
+ },
1096
+ {
1097
+ "epoch": 0.8,
1098
+ "eval_loss": 0.8166452050209045,
1099
+ "eval_runtime": 727.869,
1100
+ "eval_samples_per_second": 140.685,
1101
+ "eval_steps_per_second": 4.396,
1102
+ "step": 40000
1103
+ },
1104
+ {
1105
+ "epoch": 0.8,
1106
+ "learning_rate": 5.562721438489928e-05,
1107
+ "loss": 0.8508,
1108
+ "step": 40250
1109
+ },
1110
+ {
1111
+ "epoch": 0.81,
1112
+ "learning_rate": 5.2897025571050186e-05,
1113
+ "loss": 0.8512,
1114
+ "step": 40500
1115
+ },
1116
+ {
1117
+ "epoch": 0.81,
1118
+ "learning_rate": 5.022904360956861e-05,
1119
+ "loss": 0.8513,
1120
+ "step": 40750
1121
+ },
1122
+ {
1123
+ "epoch": 0.82,
1124
+ "learning_rate": 4.76239401506456e-05,
1125
+ "loss": 0.8503,
1126
+ "step": 41000
1127
+ },
1128
+ {
1129
+ "epoch": 0.82,
1130
+ "learning_rate": 4.5082371015145716e-05,
1131
+ "loss": 0.8506,
1132
+ "step": 41250
1133
+ },
1134
+ {
1135
+ "epoch": 0.83,
1136
+ "learning_rate": 4.260497602950688e-05,
1137
+ "loss": 0.8495,
1138
+ "step": 41500
1139
+ },
1140
+ {
1141
+ "epoch": 0.83,
1142
+ "learning_rate": 4.019237886466838e-05,
1143
+ "loss": 0.8504,
1144
+ "step": 41750
1145
+ },
1146
+ {
1147
+ "epoch": 0.84,
1148
+ "learning_rate": 3.784518687906452e-05,
1149
+ "loss": 0.85,
1150
+ "step": 42000
1151
+ },
1152
+ {
1153
+ "epoch": 0.84,
1154
+ "learning_rate": 3.556399096572541e-05,
1155
+ "loss": 0.8498,
1156
+ "step": 42250
1157
+ },
1158
+ {
1159
+ "epoch": 0.85,
1160
+ "learning_rate": 3.3349365403522986e-05,
1161
+ "loss": 0.8494,
1162
+ "step": 42500
1163
+ },
1164
+ {
1165
+ "epoch": 0.85,
1166
+ "eval_loss": 0.8152065873146057,
1167
+ "eval_runtime": 727.6431,
1168
+ "eval_samples_per_second": 140.728,
1169
+ "eval_steps_per_second": 4.398,
1170
+ "step": 42500
1171
+ },
1172
+ {
1173
+ "epoch": 0.85,
1174
+ "learning_rate": 3.120186771259927e-05,
1175
+ "loss": 0.8499,
1176
+ "step": 42750
1177
+ },
1178
+ {
1179
+ "epoch": 0.86,
1180
+ "learning_rate": 2.9122038514013678e-05,
1181
+ "loss": 0.8494,
1182
+ "step": 43000
1183
+ },
1184
+ {
1185
+ "epoch": 0.86,
1186
+ "learning_rate": 2.7110401393644464e-05,
1187
+ "loss": 0.8494,
1188
+ "step": 43250
1189
+ },
1190
+ {
1191
+ "epoch": 0.87,
1192
+ "learning_rate": 2.516746277037912e-05,
1193
+ "loss": 0.8487,
1194
+ "step": 43500
1195
+ },
1196
+ {
1197
+ "epoch": 0.87,
1198
+ "learning_rate": 2.329371176862562e-05,
1199
+ "loss": 0.8497,
1200
+ "step": 43750
1201
+ },
1202
+ {
1203
+ "epoch": 0.88,
1204
+ "learning_rate": 2.148962009517823e-05,
1205
+ "loss": 0.8483,
1206
+ "step": 44000
1207
+ },
1208
+ {
1209
+ "epoch": 0.88,
1210
+ "learning_rate": 1.9755641920468003e-05,
1211
+ "loss": 0.8494,
1212
+ "step": 44250
1213
+ },
1214
+ {
1215
+ "epoch": 0.89,
1216
+ "learning_rate": 1.8092213764227503e-05,
1217
+ "loss": 0.8479,
1218
+ "step": 44500
1219
+ },
1220
+ {
1221
+ "epoch": 0.89,
1222
+ "learning_rate": 1.6499754385599462e-05,
1223
+ "loss": 0.8487,
1224
+ "step": 44750
1225
+ },
1226
+ {
1227
+ "epoch": 0.9,
1228
+ "learning_rate": 1.4978664677716402e-05,
1229
+ "loss": 0.8483,
1230
+ "step": 45000
1231
+ },
1232
+ {
1233
+ "epoch": 0.9,
1234
+ "eval_loss": 0.814320981502533,
1235
+ "eval_runtime": 728.0287,
1236
+ "eval_samples_per_second": 140.654,
1237
+ "eval_steps_per_second": 4.395,
1238
+ "step": 45000
1239
+ },
1240
+ {
1241
+ "epoch": 0.9,
1242
+ "learning_rate": 1.3529327566777836e-05,
1243
+ "loss": 0.8489,
1244
+ "step": 45250
1245
+ },
1246
+ {
1247
+ "epoch": 0.91,
1248
+ "learning_rate": 1.2152107915650821e-05,
1249
+ "loss": 0.8485,
1250
+ "step": 45500
1251
+ },
1252
+ {
1253
+ "epoch": 0.91,
1254
+ "learning_rate": 1.0847352432017387e-05,
1255
+ "loss": 0.8487,
1256
+ "step": 45750
1257
+ },
1258
+ {
1259
+ "epoch": 0.92,
1260
+ "learning_rate": 9.615389581093124e-06,
1261
+ "loss": 0.8484,
1262
+ "step": 46000
1263
+ },
1264
+ {
1265
+ "epoch": 0.92,
1266
+ "learning_rate": 8.456529502937504e-06,
1267
+ "loss": 0.8484,
1268
+ "step": 46250
1269
+ },
1270
+ {
1271
+ "epoch": 0.93,
1272
+ "learning_rate": 7.371063934377885e-06,
1273
+ "loss": 0.8483,
1274
+ "step": 46500
1275
+ },
1276
+ {
1277
+ "epoch": 0.93,
1278
+ "learning_rate": 6.35926613556641e-06,
1279
+ "loss": 0.8482,
1280
+ "step": 46750
1281
+ },
1282
+ {
1283
+ "epoch": 0.94,
1284
+ "learning_rate": 5.421390821187988e-06,
1285
+ "loss": 0.8479,
1286
+ "step": 47000
1287
+ },
1288
+ {
1289
+ "epoch": 0.94,
1290
+ "learning_rate": 4.557674096337593e-06,
1291
+ "loss": 0.8478,
1292
+ "step": 47250
1293
+ },
1294
+ {
1295
+ "epoch": 0.95,
1296
+ "learning_rate": 3.768333397081713e-06,
1297
+ "loss": 0.8484,
1298
+ "step": 47500
1299
+ },
1300
+ {
1301
+ "epoch": 0.95,
1302
+ "eval_loss": 0.8137823343276978,
1303
+ "eval_runtime": 728.9533,
1304
+ "eval_samples_per_second": 140.475,
1305
+ "eval_steps_per_second": 4.39,
1306
+ "step": 47500
1307
+ },
1308
+ {
1309
+ "epoch": 0.95,
1310
+ "learning_rate": 3.0535674357201944e-06,
1311
+ "loss": 0.848,
1312
+ "step": 47750
1313
+ },
1314
+ {
1315
+ "epoch": 0.96,
1316
+ "learning_rate": 2.4135561507613975e-06,
1317
+ "loss": 0.8481,
1318
+ "step": 48000
1319
+ },
1320
+ {
1321
+ "epoch": 0.96,
1322
+ "learning_rate": 1.848460661623763e-06,
1323
+ "loss": 0.8478,
1324
+ "step": 48250
1325
+ },
1326
+ {
1327
+ "epoch": 0.97,
1328
+ "learning_rate": 1.3584232280746231e-06,
1329
+ "loss": 0.8475,
1330
+ "step": 48500
1331
+ },
1332
+ {
1333
+ "epoch": 0.97,
1334
+ "learning_rate": 9.435672144173178e-07,
1335
+ "loss": 0.8482,
1336
+ "step": 48750
1337
+ },
1338
+ {
1339
+ "epoch": 0.98,
1340
+ "learning_rate": 6.03997058434702e-07,
1341
+ "loss": 0.8484,
1342
+ "step": 49000
1343
+ },
1344
+ {
1345
+ "epoch": 0.98,
1346
+ "learning_rate": 3.397982450976111e-07,
1347
+ "loss": 0.848,
1348
+ "step": 49250
1349
+ },
1350
+ {
1351
+ "epoch": 0.99,
1352
+ "learning_rate": 1.5103728504447522e-07,
1353
+ "loss": 0.8481,
1354
+ "step": 49500
1355
+ },
1356
+ {
1357
+ "epoch": 0.99,
1358
+ "learning_rate": 3.776169783747951e-08,
1359
+ "loss": 0.8484,
1360
+ "step": 49750
1361
+ },
1362
+ {
1363
+ "epoch": 1.0,
1364
+ "learning_rate": 0.0,
1365
+ "loss": 0.8475,
1366
+ "step": 50000
1367
+ },
1368
+ {
1369
+ "epoch": 1.0,
1370
+ "eval_loss": 0.8137637972831726,
1371
+ "eval_runtime": 727.8603,
1372
+ "eval_samples_per_second": 140.686,
1373
+ "eval_steps_per_second": 4.396,
1374
+ "step": 50000
1375
+ },
1376
+ {
1377
+ "epoch": 1.0,
1378
+ "step": 50000,
1379
+ "total_flos": 2.180439447726719e+19,
1380
+ "train_loss": 0.9020122967529297,
1381
+ "train_runtime": 575442.638,
1382
+ "train_samples_per_second": 44.487,
1383
+ "train_steps_per_second": 0.087
1384
+ }
1385
+ ],
1386
+ "max_steps": 50000,
1387
+ "num_train_epochs": 1,
1388
+ "total_flos": 2.180439447726719e+19,
1389
+ "trial_name": null,
1390
+ "trial_params": null
1391
+ }
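
Note: the trainer state records a single epoch of 50,000 steps with a loss logged every 250 steps and an evaluation pass every 2,500 steps; training loss falls from 12.42 at step 1 to about 0.85, and the final eval loss is roughly 0.8138. A small sketch separating the two series from log_history:

```python
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Entries with "loss" are training logs; entries with "eval_loss" are eval passes.
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(train[0], train[-1])   # (1, 12.4223) ... (50000, 0.8475)
print(evals[-1])             # (50000, 0.8137637972831726)
```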
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:567a0992a3ba305dc96766572e6853ebcf2a017efb5481409fce52d6433ac7fb
3
+ size 3375
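
Note: training_args.bin is also an LFS pointer, but tiny (about 3.4 kB): the Trainer's TrainingArguments object serialized with torch.save. A hedged sketch for inspecting it, assuming a transformers installation compatible with the version recorded in config.json (4.20.0):

```python
import torch

# training_args.bin is a pickled TrainingArguments object; recent torch
# versions may require weights_only=False to unpickle arbitrary objects.
args = torch.load("training_args.bin")
print(args.learning_rate, args.max_steps, args.per_device_train_batch_size)
```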