noanabeshima
committed on
Commit
•
de1867f
1
Parent(s):
7fedee0
Upload folder using huggingface_hub
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- attn_test/A0_S-2_R1_P0.pt +3 -0
- attn_test/A0_S-2_R1_P0_config.json +38 -0
- attn_test/A0_S-3_R1_P0.pt +3 -0
- attn_test/A0_S-3_R1_P0_config.json +38 -0
- attn_test/A0_S-4_R1_P0.pt +3 -0
- attn_test/A0_S-4_R1_P0_config.json +38 -0
- attn_test/A0_S-5_R1_P0.pt +3 -0
- attn_test/A0_S-5_R1_P0_config.json +38 -0
- attn_test/A0_S-6_R1_P0.pt +3 -0
- attn_test/A0_S-6_R1_P0_config.json +38 -0
- attn_test/A0_S-7_R1_P0.pt +3 -0
- attn_test/A0_S-7_R1_P0_config.json +38 -0
- attn_test/A0_S-8_R1_P0.pt +3 -0
- attn_test/A0_S-8_R1_P0_config.json +38 -0
- attn_test/A1_S-2_R1_P0.pt +3 -0
- attn_test/A1_S-2_R1_P0_config.json +38 -0
- attn_test/A1_S-3_R1_P0.pt +3 -0
- attn_test/A1_S-3_R1_P0_config.json +38 -0
- attn_test/A1_S-4_R1_P0.pt +3 -0
- attn_test/A1_S-4_R1_P0_config.json +38 -0
- attn_test/A1_S-5_R1_P0.pt +3 -0
- attn_test/A1_S-5_R1_P0_config.json +38 -0
- attn_test/A1_S-6_R1_P0.pt +3 -0
- attn_test/A1_S-6_R1_P0_config.json +38 -0
- attn_test/A1_S-7_R1_P0.pt +3 -0
- attn_test/A1_S-7_R1_P0_config.json +38 -0
- attn_test/A1_S-8_R1_P0.pt +3 -0
- attn_test/A1_S-8_R1_P0_config.json +38 -0
- attn_test/A2_S-2_R1_P0.pt +3 -0
- attn_test/A2_S-2_R1_P0_config.json +38 -0
- attn_test/A2_S-3_R1_P0.pt +3 -0
- attn_test/A2_S-3_R1_P0_config.json +38 -0
- attn_test/A2_S-4_R1_P0.pt +3 -0
- attn_test/A2_S-4_R1_P0_config.json +38 -0
- attn_test/A2_S-5_R1_P0.pt +3 -0
- attn_test/A2_S-5_R1_P0_config.json +38 -0
- attn_test/A2_S-6_R1_P0.pt +3 -0
- attn_test/A2_S-6_R1_P0_config.json +38 -0
- attn_test/A2_S-7_R1_P0.pt +3 -0
- attn_test/A2_S-7_R1_P0_config.json +38 -0
- attn_test/A2_S-8_R1_P0.pt +3 -0
- attn_test/A2_S-8_R1_P0_config.json +38 -0
- attn_test/A3_S-2_R1_P0.pt +3 -0
- attn_test/A3_S-2_R1_P0_config.json +38 -0
- attn_test/A3_S-3_R1_P0.pt +3 -0
- attn_test/A3_S-3_R1_P0_config.json +38 -0
- attn_test/A3_S-4_R1_P0.pt +3 -0
- attn_test/A3_S-4_R1_P0_config.json +38 -0
- attn_test/A3_S-5_R1_P0.pt +3 -0
- attn_test/A3_S-5_R1_P0_config.json +38 -0
attn_test/A0_S-2_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6a967d32a0cf8724fa9fe4d2a6aac4925037c75c5307891546056f5fd04dc9fd
|
3 |
+
size 153705080
|
attn_test/A0_S-2_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -2,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_S-2_R1_P0"
|
38 |
+
}
|
attn_test/A0_S-3_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f1b7ccabe7a4a3e5fa5811c6013c57228362bf80734afcbecf1b567569e5f542
|
3 |
+
size 153705080
|
attn_test/A0_S-3_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -3,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_S-3_R1_P0"
|
38 |
+
}
|
attn_test/A0_S-4_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:df8d6af259884d54c4fe117c8b4bff09a04e049187cd6a29f16a1445a09d656c
|
3 |
+
size 153705080
|
attn_test/A0_S-4_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -4,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_S-4_R1_P0"
|
38 |
+
}
|
attn_test/A0_S-5_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3225b3849eaf2b0526808aa02f477084cd39751145125b50345778b11e78d21d
|
3 |
+
size 153705080
|
attn_test/A0_S-5_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -5,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_S-5_R1_P0"
|
38 |
+
}
|
attn_test/A0_S-6_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1aa1970e842e927e4f1c02dee66572818ffea6bc4a40d7d97e62e57d4d95356e
|
3 |
+
size 153705080
|
attn_test/A0_S-6_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -6,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_S-6_R1_P0"
|
38 |
+
}
|
attn_test/A0_S-7_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e63474090a94d6ca4dc76f1198a407692960dc5199abad46ee8409307390d924
|
3 |
+
size 153705080
|
attn_test/A0_S-7_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -7,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_S-7_R1_P0"
|
38 |
+
}
|
attn_test/A0_S-8_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4b7a0ae2e87296395587c99f6d6dda887ce2a22211e868959cd4184c7b32dde7
|
3 |
+
size 153705080
|
attn_test/A0_S-8_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 0,
|
20 |
+
"l1_exp": -8,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A0_S-8_R1_P0"
|
38 |
+
}
|
attn_test/A1_S-2_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9da8e0ed7db2cfd12e35d74f726b1284295347b9695a965f7ddcaffb0449399e
|
3 |
+
size 153705080
|
attn_test/A1_S-2_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 1,
|
20 |
+
"l1_exp": -2,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A1_S-2_R1_P0"
|
38 |
+
}
|
attn_test/A1_S-3_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eaa30e7dcfdc66166c34f3b63d3c02048c5f7bbcd2c35e0ed6ab7a1c0213d3b5
|
3 |
+
size 153705080
|
attn_test/A1_S-3_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 1,
|
20 |
+
"l1_exp": -3,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A1_S-3_R1_P0"
|
38 |
+
}
|
attn_test/A1_S-4_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c39898304960fef85e4e335e2ef19aa320b9b52addf607631748cafe3af72f68
|
3 |
+
size 153705080
|
attn_test/A1_S-4_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 1,
|
20 |
+
"l1_exp": -4,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A1_S-4_R1_P0"
|
38 |
+
}
|
attn_test/A1_S-5_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f83a42d62c42719e64afb5874fdc7a816aeb66051a9ac0700ef4bc0152de88b3
|
3 |
+
size 153705080
|
attn_test/A1_S-5_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 1,
|
20 |
+
"l1_exp": -5,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A1_S-5_R1_P0"
|
38 |
+
}
|
attn_test/A1_S-6_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a46305261cd3ecf76a415d4b237a6b9e78fe6f98299d32dc1b72c83951e1883f
|
3 |
+
size 153705080
|
attn_test/A1_S-6_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 1,
|
20 |
+
"l1_exp": -6,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A1_S-6_R1_P0"
|
38 |
+
}
|
attn_test/A1_S-7_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9dc01776116cda0db85b31f842153d595aba0d89557090e5ee61d10e83098bfe
|
3 |
+
size 153705080
|
attn_test/A1_S-7_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 1,
|
20 |
+
"l1_exp": -7,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A1_S-7_R1_P0"
|
38 |
+
}
|
attn_test/A1_S-8_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f7f67148f42731c6f0417868ebbb4840f3ca38214c53a43579b7e74c2bd7498e
|
3 |
+
size 153705080
|
attn_test/A1_S-8_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 1,
|
20 |
+
"l1_exp": -8,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A1_S-8_R1_P0"
|
38 |
+
}
|
attn_test/A2_S-2_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:82a94ba450fb8ab21ac7951ba04a152282ccc9048e082a518f865768cf735eaf
|
3 |
+
size 153705080
|
attn_test/A2_S-2_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 2,
|
20 |
+
"l1_exp": -2,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A2_S-2_R1_P0"
|
38 |
+
}
|
attn_test/A2_S-3_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de6f0b4dad578eab23f3a7e0373c4e2aae4cf2f23a161cf05672185687d0634d
|
3 |
+
size 153705080
|
attn_test/A2_S-3_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 2,
|
20 |
+
"l1_exp": -3,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A2_S-3_R1_P0"
|
38 |
+
}
|
attn_test/A2_S-4_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c89d9e68cc30e96eb3466874e9c058dea7ea6e04df51a2f8be962cc8e883b022
|
3 |
+
size 153705080
|
attn_test/A2_S-4_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 2,
|
20 |
+
"l1_exp": -4,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A2_S-4_R1_P0"
|
38 |
+
}
|
attn_test/A2_S-5_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ce3bfc12d00c24c5f11bca089b3b567e8caeba847bb17b0c63122a1e4e54838b
|
3 |
+
size 153705080
|
attn_test/A2_S-5_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 2,
|
20 |
+
"l1_exp": -5,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A2_S-5_R1_P0"
|
38 |
+
}
|
attn_test/A2_S-6_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:946fb7a78c0a1bbdffdeebac539ce09e00dcfe2510f6a119a2e87ea1ae291df1
|
3 |
+
size 153705080
|
attn_test/A2_S-6_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 2,
|
20 |
+
"l1_exp": -6,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A2_S-6_R1_P0"
|
38 |
+
}
|
attn_test/A2_S-7_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:21d55f2678f24117c8963bd013ac42b6202b770f9a2423c1f55ba01967f6e670
|
3 |
+
size 153705080
|
attn_test/A2_S-7_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 2,
|
20 |
+
"l1_exp": -7,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A2_S-7_R1_P0"
|
38 |
+
}
|
attn_test/A2_S-8_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:093c51dda5c92c3b33ca29a6d2944e48ae83ff060ea812cec54ed8196e4a0fb7
|
3 |
+
size 153705080
|
attn_test/A2_S-8_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 2,
|
20 |
+
"l1_exp": -8,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A2_S-8_R1_P0"
|
38 |
+
}
|
attn_test/A3_S-2_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:93e8279dc7296f421a340036b17b96afbdccd4b3a31a60845b572962f39acaf3
|
3 |
+
size 153705080
|
attn_test/A3_S-2_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 3,
|
20 |
+
"l1_exp": -2,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A3_S-2_R1_P0"
|
38 |
+
}
|
attn_test/A3_S-3_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9924363fe283d547f84cfa283ef0e0a56aa64ae2291265951f540685cd3dc20c
|
3 |
+
size 153705080
|
attn_test/A3_S-3_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 3,
|
20 |
+
"l1_exp": -3,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A3_S-3_R1_P0"
|
38 |
+
}
|
attn_test/A3_S-4_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:774dd034c7cc62ddfc779bb3ba8f7cb4a8c0791f32b6afde75027e6477402eed
|
3 |
+
size 153705080
|
attn_test/A3_S-4_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 3,
|
20 |
+
"l1_exp": -4,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A3_S-4_R1_P0"
|
38 |
+
}
|
attn_test/A3_S-5_R1_P0.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2b6bc4b24e527dd772f059545765a0d3041b29d6c57ebb99d15d0a6a49ca1be0
|
3 |
+
size 153705080
|
attn_test/A3_S-5_R1_P0_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"n_features": 25000,
|
3 |
+
"d_model": 768,
|
4 |
+
"lr_exp": -10,
|
5 |
+
"disable_comet": false,
|
6 |
+
"per_neuron_reinit_interval": 0,
|
7 |
+
"reservoir_time_discount": 0.995,
|
8 |
+
"reinit_interval": 800,
|
9 |
+
"max_reinit_neurons": 5000,
|
10 |
+
"reservoir_size": 5000,
|
11 |
+
"n_piles": 292,
|
12 |
+
"log_interval": 200,
|
13 |
+
"reinit_input_norm": "target_scaled",
|
14 |
+
"reinit_input": "error",
|
15 |
+
"reinit_norm_alpha": 0.3,
|
16 |
+
"data_loc": "attn_data",
|
17 |
+
"reinit_threshold": -6,
|
18 |
+
"scheduler": "wsd",
|
19 |
+
"layer_idx": 3,
|
20 |
+
"l1_exp": -5,
|
21 |
+
"neuron_reinit_percent": 0.85,
|
22 |
+
"beta1": 1,
|
23 |
+
"beta2": 4,
|
24 |
+
"reinit_target": "error",
|
25 |
+
"sparse_adam": false,
|
26 |
+
"run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
|
27 |
+
"project_name": "attn_test",
|
28 |
+
"decoder_bias": true,
|
29 |
+
"l1_beta": 0.99,
|
30 |
+
"alt_sparsity_loss": "log",
|
31 |
+
"l1_ratio": 1,
|
32 |
+
"l1_p": 0,
|
33 |
+
"optimizer": "sparse_adam",
|
34 |
+
"model_type": "attn_out",
|
35 |
+
"adam_beta1": 0.5,
|
36 |
+
"adam_beta2": 0.9375,
|
37 |
+
"run_name": "A3_S-5_R1_P0"
|
38 |
+
}
|