mineself2016 committed on
Commit
85971b0
1 Parent(s): 266ff7b

Add pretrained model

Browse files
config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MambaModel"
4
+ ],
5
+ "d_model": 512,
6
+ "mamba_layer": 24,
7
+ "torch_dtype": "float32",
8
+ "transformers_version": "4.40.2",
9
+ "vocab_size": 25426
10
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07a8347e2037f04f81aa44c66249be1a046ddb99a880d66005d8e4e64a099689
3
+ size 262998656
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac7b77ee39b6b682a8ee916e75fda7013310e73d61a43147d2c2115d7a0f8e9f
3
+ size 526118394
rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08c0461ead1d086b6a7b05ee652866e36e6149706b273358575cd003c7a3c74f
3
+ size 14960
rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6535ceff04190f148d8ab8cf37f923d0d48880339c08388a16cbac532cacaa26
3
+ size 14960
rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ff44d5a772eb074ac4c7038ce3ad928f186c1109b33721d9e18813b8ea9821d
3
+ size 14960
rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd493093e5f3d375ac8690daf11ca272b0923a4906fa47075345bbccd33df4ce
3
+ size 14960
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c98faabda8bc1e24341e61d274292ecafdf6fa257df7d8331e9214e8b4643aa0
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "pad_token": "[PAD]",
3
+ "unk_token": "[UNK]"
4
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {},
3
+ "clean_up_tokenization_spaces": true,
4
+ "model_max_length": 1000000000000000019884624838656,
5
+ "pad_token": "[PAD]",
6
+ "tokenizer_class": "PreTrainedTokenizerFast",
7
+ "unk_token": "[UNK]"
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 31250,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04,
13
+ "grad_norm": 0.00015208835247904062,
14
+ "learning_rate": 4.8e-05,
15
+ "loss": 0.0005,
16
+ "step": 1250
17
+ },
18
+ {
19
+ "epoch": 0.08,
20
+ "grad_norm": 0.000780147616751492,
21
+ "learning_rate": 4.600000000000001e-05,
22
+ "loss": 0.0001,
23
+ "step": 2500
24
+ },
25
+ {
26
+ "epoch": 0.12,
27
+ "grad_norm": 2.6835205062525347e-05,
28
+ "learning_rate": 4.4000000000000006e-05,
29
+ "loss": 0.0002,
30
+ "step": 3750
31
+ },
32
+ {
33
+ "epoch": 0.16,
34
+ "grad_norm": 0.0013310563517734408,
35
+ "learning_rate": 4.2e-05,
36
+ "loss": 0.0,
37
+ "step": 5000
38
+ },
39
+ {
40
+ "epoch": 0.2,
41
+ "grad_norm": 2.0090383259230293e-05,
42
+ "learning_rate": 4e-05,
43
+ "loss": 0.0,
44
+ "step": 6250
45
+ },
46
+ {
47
+ "epoch": 0.24,
48
+ "grad_norm": 0.0005068681784905493,
49
+ "learning_rate": 3.8e-05,
50
+ "loss": 0.0001,
51
+ "step": 7500
52
+ },
53
+ {
54
+ "epoch": 0.28,
55
+ "grad_norm": 0.0011675615096464753,
56
+ "learning_rate": 3.6e-05,
57
+ "loss": 0.0004,
58
+ "step": 8750
59
+ },
60
+ {
61
+ "epoch": 0.32,
62
+ "grad_norm": 1.4769906556466594e-05,
63
+ "learning_rate": 3.4000000000000007e-05,
64
+ "loss": 0.0,
65
+ "step": 10000
66
+ },
67
+ {
68
+ "epoch": 0.36,
69
+ "grad_norm": 0.000697318697348237,
70
+ "learning_rate": 3.2000000000000005e-05,
71
+ "loss": 0.0,
72
+ "step": 11250
73
+ },
74
+ {
75
+ "epoch": 0.4,
76
+ "grad_norm": 0.0014425746630877256,
77
+ "learning_rate": 3e-05,
78
+ "loss": 0.0,
79
+ "step": 12500
80
+ },
81
+ {
82
+ "epoch": 0.44,
83
+ "grad_norm": 5.298478572512977e-05,
84
+ "learning_rate": 2.8000000000000003e-05,
85
+ "loss": 0.0,
86
+ "step": 13750
87
+ },
88
+ {
89
+ "epoch": 0.48,
90
+ "grad_norm": 2.2371379600372165e-05,
91
+ "learning_rate": 2.6000000000000002e-05,
92
+ "loss": 0.0,
93
+ "step": 15000
94
+ },
95
+ {
96
+ "epoch": 0.52,
97
+ "grad_norm": 0.0001511627488071099,
98
+ "learning_rate": 2.4e-05,
99
+ "loss": 0.0,
100
+ "step": 16250
101
+ },
102
+ {
103
+ "epoch": 0.56,
104
+ "grad_norm": 0.0011282784398645163,
105
+ "learning_rate": 2.2000000000000003e-05,
106
+ "loss": 0.0,
107
+ "step": 17500
108
+ },
109
+ {
110
+ "epoch": 0.6,
111
+ "grad_norm": 6.4835912780836225e-06,
112
+ "learning_rate": 2e-05,
113
+ "loss": 0.0,
114
+ "step": 18750
115
+ },
116
+ {
117
+ "epoch": 0.64,
118
+ "grad_norm": 6.069922619644785e-06,
119
+ "learning_rate": 1.8e-05,
120
+ "loss": 0.0,
121
+ "step": 20000
122
+ },
123
+ {
124
+ "epoch": 0.68,
125
+ "grad_norm": 3.5463058338791598e-06,
126
+ "learning_rate": 1.6000000000000003e-05,
127
+ "loss": 0.0,
128
+ "step": 21250
129
+ },
130
+ {
131
+ "epoch": 0.72,
132
+ "grad_norm": 1.4286022633314133e-05,
133
+ "learning_rate": 1.4000000000000001e-05,
134
+ "loss": 0.0,
135
+ "step": 22500
136
+ },
137
+ {
138
+ "epoch": 0.76,
139
+ "grad_norm": 5.859881639480591e-06,
140
+ "learning_rate": 1.2e-05,
141
+ "loss": 0.0,
142
+ "step": 23750
143
+ },
144
+ {
145
+ "epoch": 0.8,
146
+ "grad_norm": 7.3442338361928705e-06,
147
+ "learning_rate": 1e-05,
148
+ "loss": 0.0,
149
+ "step": 25000
150
+ },
151
+ {
152
+ "epoch": 0.84,
153
+ "grad_norm": 8.827374585962389e-06,
154
+ "learning_rate": 8.000000000000001e-06,
155
+ "loss": 0.0,
156
+ "step": 26250
157
+ },
158
+ {
159
+ "epoch": 0.88,
160
+ "grad_norm": 5.32037984157796e-06,
161
+ "learning_rate": 6e-06,
162
+ "loss": 0.0,
163
+ "step": 27500
164
+ },
165
+ {
166
+ "epoch": 0.92,
167
+ "grad_norm": 2.4539526748412754e-06,
168
+ "learning_rate": 4.000000000000001e-06,
169
+ "loss": 0.0,
170
+ "step": 28750
171
+ },
172
+ {
173
+ "epoch": 0.96,
174
+ "grad_norm": 4.904304205410881e-06,
175
+ "learning_rate": 2.0000000000000003e-06,
176
+ "loss": 0.0,
177
+ "step": 30000
178
+ },
179
+ {
180
+ "epoch": 1.0,
181
+ "grad_norm": 2.830231323969201e-06,
182
+ "learning_rate": 0.0,
183
+ "loss": 0.0,
184
+ "step": 31250
185
+ }
186
+ ],
187
+ "logging_steps": 1250,
188
+ "max_steps": 31250,
189
+ "num_input_tokens_seen": 0,
190
+ "num_train_epochs": 1,
191
+ "save_steps": 1250,
192
+ "total_flos": 1.2558389540626104e+18,
193
+ "train_batch_size": 16,
194
+ "trial_name": null,
195
+ "trial_params": null
196
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1b2c297b9f6ca56c10529b42fa95f5ef87ec325355dee476ba664a2fd52d5cd
3
+ size 5048