noanabeshima
/

tiny_model

Model card Files Files and versions Community

noanabeshima commited on Jul 4

Commit

de1867f

•

1 Parent(s): 7fedee0

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

attn_test/A0_S-2_R1_P0.pt +3 -0
attn_test/A0_S-2_R1_P0_config.json +38 -0
attn_test/A0_S-3_R1_P0.pt +3 -0
attn_test/A0_S-3_R1_P0_config.json +38 -0
attn_test/A0_S-4_R1_P0.pt +3 -0
attn_test/A0_S-4_R1_P0_config.json +38 -0
attn_test/A0_S-5_R1_P0.pt +3 -0
attn_test/A0_S-5_R1_P0_config.json +38 -0
attn_test/A0_S-6_R1_P0.pt +3 -0
attn_test/A0_S-6_R1_P0_config.json +38 -0
attn_test/A0_S-7_R1_P0.pt +3 -0
attn_test/A0_S-7_R1_P0_config.json +38 -0
attn_test/A0_S-8_R1_P0.pt +3 -0
attn_test/A0_S-8_R1_P0_config.json +38 -0
attn_test/A1_S-2_R1_P0.pt +3 -0
attn_test/A1_S-2_R1_P0_config.json +38 -0
attn_test/A1_S-3_R1_P0.pt +3 -0
attn_test/A1_S-3_R1_P0_config.json +38 -0
attn_test/A1_S-4_R1_P0.pt +3 -0
attn_test/A1_S-4_R1_P0_config.json +38 -0
attn_test/A1_S-5_R1_P0.pt +3 -0
attn_test/A1_S-5_R1_P0_config.json +38 -0
attn_test/A1_S-6_R1_P0.pt +3 -0
attn_test/A1_S-6_R1_P0_config.json +38 -0
attn_test/A1_S-7_R1_P0.pt +3 -0
attn_test/A1_S-7_R1_P0_config.json +38 -0
attn_test/A1_S-8_R1_P0.pt +3 -0
attn_test/A1_S-8_R1_P0_config.json +38 -0
attn_test/A2_S-2_R1_P0.pt +3 -0
attn_test/A2_S-2_R1_P0_config.json +38 -0
attn_test/A2_S-3_R1_P0.pt +3 -0
attn_test/A2_S-3_R1_P0_config.json +38 -0
attn_test/A2_S-4_R1_P0.pt +3 -0
attn_test/A2_S-4_R1_P0_config.json +38 -0
attn_test/A2_S-5_R1_P0.pt +3 -0
attn_test/A2_S-5_R1_P0_config.json +38 -0
attn_test/A2_S-6_R1_P0.pt +3 -0
attn_test/A2_S-6_R1_P0_config.json +38 -0
attn_test/A2_S-7_R1_P0.pt +3 -0
attn_test/A2_S-7_R1_P0_config.json +38 -0
attn_test/A2_S-8_R1_P0.pt +3 -0
attn_test/A2_S-8_R1_P0_config.json +38 -0
attn_test/A3_S-2_R1_P0.pt +3 -0
attn_test/A3_S-2_R1_P0_config.json +38 -0
attn_test/A3_S-3_R1_P0.pt +3 -0
attn_test/A3_S-3_R1_P0_config.json +38 -0
attn_test/A3_S-4_R1_P0.pt +3 -0
attn_test/A3_S-4_R1_P0_config.json +38 -0
attn_test/A3_S-5_R1_P0.pt +3 -0
attn_test/A3_S-5_R1_P0_config.json +38 -0

attn_test/A0_S-2_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6a967d32a0cf8724fa9fe4d2a6aac4925037c75c5307891546056f5fd04dc9fd
+size 153705080

attn_test/A0_S-2_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 0,
+  "l1_exp": -2,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A0_S-2_R1_P0"
+}

attn_test/A0_S-3_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f1b7ccabe7a4a3e5fa5811c6013c57228362bf80734afcbecf1b567569e5f542
+size 153705080

attn_test/A0_S-3_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 0,
+  "l1_exp": -3,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A0_S-3_R1_P0"
+}

attn_test/A0_S-4_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:df8d6af259884d54c4fe117c8b4bff09a04e049187cd6a29f16a1445a09d656c
+size 153705080

attn_test/A0_S-4_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 0,
+  "l1_exp": -4,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A0_S-4_R1_P0"
+}

attn_test/A0_S-5_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3225b3849eaf2b0526808aa02f477084cd39751145125b50345778b11e78d21d
+size 153705080

attn_test/A0_S-5_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 0,
+  "l1_exp": -5,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A0_S-5_R1_P0"
+}

attn_test/A0_S-6_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1aa1970e842e927e4f1c02dee66572818ffea6bc4a40d7d97e62e57d4d95356e
+size 153705080

attn_test/A0_S-6_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 0,
+  "l1_exp": -6,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A0_S-6_R1_P0"
+}

attn_test/A0_S-7_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e63474090a94d6ca4dc76f1198a407692960dc5199abad46ee8409307390d924
+size 153705080

attn_test/A0_S-7_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 0,
+  "l1_exp": -7,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A0_S-7_R1_P0"
+}

attn_test/A0_S-8_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4b7a0ae2e87296395587c99f6d6dda887ce2a22211e868959cd4184c7b32dde7
+size 153705080

attn_test/A0_S-8_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 0,
+  "l1_exp": -8,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A0_S-8_R1_P0"
+}

attn_test/A1_S-2_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9da8e0ed7db2cfd12e35d74f726b1284295347b9695a965f7ddcaffb0449399e
+size 153705080

attn_test/A1_S-2_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 1,
+  "l1_exp": -2,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A1_S-2_R1_P0"
+}

attn_test/A1_S-3_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eaa30e7dcfdc66166c34f3b63d3c02048c5f7bbcd2c35e0ed6ab7a1c0213d3b5
+size 153705080

attn_test/A1_S-3_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 1,
+  "l1_exp": -3,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A1_S-3_R1_P0"
+}

attn_test/A1_S-4_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c39898304960fef85e4e335e2ef19aa320b9b52addf607631748cafe3af72f68
+size 153705080

attn_test/A1_S-4_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 1,
+  "l1_exp": -4,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A1_S-4_R1_P0"
+}

attn_test/A1_S-5_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f83a42d62c42719e64afb5874fdc7a816aeb66051a9ac0700ef4bc0152de88b3
+size 153705080

attn_test/A1_S-5_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 1,
+  "l1_exp": -5,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A1_S-5_R1_P0"
+}

attn_test/A1_S-6_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a46305261cd3ecf76a415d4b237a6b9e78fe6f98299d32dc1b72c83951e1883f
+size 153705080

attn_test/A1_S-6_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 1,
+  "l1_exp": -6,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A1_S-6_R1_P0"
+}

attn_test/A1_S-7_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9dc01776116cda0db85b31f842153d595aba0d89557090e5ee61d10e83098bfe
+size 153705080

attn_test/A1_S-7_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 1,
+  "l1_exp": -7,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A1_S-7_R1_P0"
+}

attn_test/A1_S-8_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f7f67148f42731c6f0417868ebbb4840f3ca38214c53a43579b7e74c2bd7498e
+size 153705080

attn_test/A1_S-8_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 1,
+  "l1_exp": -8,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A1_S-8_R1_P0"
+}

attn_test/A2_S-2_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:82a94ba450fb8ab21ac7951ba04a152282ccc9048e082a518f865768cf735eaf
+size 153705080

attn_test/A2_S-2_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 2,
+  "l1_exp": -2,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A2_S-2_R1_P0"
+}

attn_test/A2_S-3_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:de6f0b4dad578eab23f3a7e0373c4e2aae4cf2f23a161cf05672185687d0634d
+size 153705080

attn_test/A2_S-3_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 2,
+  "l1_exp": -3,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A2_S-3_R1_P0"
+}

attn_test/A2_S-4_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c89d9e68cc30e96eb3466874e9c058dea7ea6e04df51a2f8be962cc8e883b022
+size 153705080

attn_test/A2_S-4_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 2,
+  "l1_exp": -4,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A2_S-4_R1_P0"
+}

attn_test/A2_S-5_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ce3bfc12d00c24c5f11bca089b3b567e8caeba847bb17b0c63122a1e4e54838b
+size 153705080

attn_test/A2_S-5_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 2,
+  "l1_exp": -5,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A2_S-5_R1_P0"
+}

attn_test/A2_S-6_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:946fb7a78c0a1bbdffdeebac539ce09e00dcfe2510f6a119a2e87ea1ae291df1
+size 153705080

attn_test/A2_S-6_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 2,
+  "l1_exp": -6,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A2_S-6_R1_P0"
+}

attn_test/A2_S-7_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:21d55f2678f24117c8963bd013ac42b6202b770f9a2423c1f55ba01967f6e670
+size 153705080

attn_test/A2_S-7_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 2,
+  "l1_exp": -7,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A2_S-7_R1_P0"
+}

attn_test/A2_S-8_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:093c51dda5c92c3b33ca29a6d2944e48ae83ff060ea812cec54ed8196e4a0fb7
+size 153705080

attn_test/A2_S-8_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 2,
+  "l1_exp": -8,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A2_S-8_R1_P0"
+}

attn_test/A3_S-2_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:93e8279dc7296f421a340036b17b96afbdccd4b3a31a60845b572962f39acaf3
+size 153705080

attn_test/A3_S-2_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 3,
+  "l1_exp": -2,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A3_S-2_R1_P0"
+}

attn_test/A3_S-3_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9924363fe283d547f84cfa283ef0e0a56aa64ae2291265951f540685cd3dc20c
+size 153705080

attn_test/A3_S-3_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 3,
+  "l1_exp": -3,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A3_S-3_R1_P0"
+}

attn_test/A3_S-4_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:774dd034c7cc62ddfc779bb3ba8f7cb4a8c0791f32b6afde75027e6477402eed
+size 153705080

attn_test/A3_S-4_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 3,
+  "l1_exp": -4,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A3_S-4_R1_P0"
+}

attn_test/A3_S-5_R1_P0.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2b6bc4b24e527dd772f059545765a0d3041b29d6c57ebb99d15d0a6a49ca1be0
+size 153705080

attn_test/A3_S-5_R1_P0_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "n_features": 25000,
+  "d_model": 768,
+  "lr_exp": -10,
+  "disable_comet": false,
+  "per_neuron_reinit_interval": 0,
+  "reservoir_time_discount": 0.995,
+  "reinit_interval": 800,
+  "max_reinit_neurons": 5000,
+  "reservoir_size": 5000,
+  "n_piles": 292,
+  "log_interval": 200,
+  "reinit_input_norm": "target_scaled",
+  "reinit_input": "error",
+  "reinit_norm_alpha": 0.3,
+  "data_loc": "attn_data",
+  "reinit_threshold": -6,
+  "scheduler": "wsd",
+  "layer_idx": 3,
+  "l1_exp": -5,
+  "neuron_reinit_percent": 0.85,
+  "beta1": 1,
+  "beta2": 4,
+  "reinit_target": "error",
+  "sparse_adam": false,
+  "run_template": "A{layer_idx}_S{l1_exp}_R{l1_ratio}_P{l1_p}",
+  "project_name": "attn_test",
+  "decoder_bias": true,
+  "l1_beta": 0.99,
+  "alt_sparsity_loss": "log",
+  "l1_ratio": 1,
+  "l1_p": 0,
+  "optimizer": "sparse_adam",
+  "model_type": "attn_out",
+  "adam_beta1": 0.5,
+  "adam_beta2": 0.9375,
+  "run_name": "A3_S-5_R1_P0"
+}