Commit d1017b4
Parent(s):
Duplicate from jbochi/madlad400-8b-lm
Co-authored-by: J Bochi <jbochi@users.noreply.huggingface.co>
- .gitattributes +36 -0
- README.md +462 -0
- added_tokens.json +2 -0
- config.json +39 -0
- decoderonlyt5_config.py +11 -0
- decoderonlyt5_modeling.py +840 -0
- model-00000-of-00007.safetensors +3 -0
- model-00001-of-00007.safetensors +3 -0
- model-00002-of-00007.safetensors +3 -0
- model-00003-of-00007.safetensors +3 -0
- model-00004-of-00007.safetensors +3 -0
- model-00005-of-00007.safetensors +3 -0
- model-00006-of-00007.safetensors +3 -0
- model-00007-of-00007.safetensors +3 -0
- model.safetensors.index.json +262 -0
- special_tokens_map.json +23 -0
- spiece.model +3 -0
- tokenizer.json +3 -0
- tokenizer_config.json +38 -0
.gitattributes
ADDED
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,462 @@
+---
+license: apache-2.0
+language:
+- en
+- ru
+- es
+- fr
+- de
+- it
+- pt
+- pl
+- nl
+- vi
+- tr
+- sv
+- id
+- ro
+- cs
+- zh
+- hu
+- ja
+- th
+- fi
+- fa
+- uk
+- da
+- el
+- "no"
+- bg
+- sk
+- ko
+- ar
+- lt
+- ca
+- sl
+- he
+- et
+- lv
+- hi
+- sq
+- ms
+- az
+- sr
+- ta
+- hr
+- kk
+- is
+- ml
+- mr
+- te
+- af
+- gl
+- fil
+- be
+- mk
+- eu
+- bn
+- ka
+- mn
+- bs
+- uz
+- ur
+- sw
+- yue
+- ne
+- kn
+- kaa
+- gu
+- si
+- cy
+- eo
+- la
+- hy
+- ky
+- tg
+- ga
+- mt
+- my
+- km
+- tt
+- so
+- ku
+- ps
+- pa
+- rw
+- lo
+- ha
+- dv
+- fy
+- lb
+- ckb
+- mg
+- gd
+- am
+- ug
+- ht
+- grc
+- hmn
+- sd
+- jv
+- mi
+- tk
+- ceb
+- yi
+- ba
+- fo
+- or
+- xh
+- su
+- kl
+- ny
+- sm
+- sn
+- co
+- zu
+- ig
+- yo
+- pap
+- st
+- haw
+- as
+- oc
+- cv
+- lus
+- tet
+- gsw
+- sah
+- br
+- rm
+- sa
+- bo
+- om
+- se
+- ce
+- cnh
+- ilo
+- hil
+- udm
+- os
+- lg
+- ti
+- vec
+- ts
+- tyv
+- kbd
+- ee
+- iba
+- av
+- kha
+- to
+- tn
+- nso
+- fj
+- zza
+- ak
+- ada
+- otq
+- dz
+- bua
+- cfm
+- ln
+- chm
+- gn
+- krc
+- wa
+- hif
+- yua
+- srn
+- war
+- rom
+- bik
+- pam
+- sg
+- lu
+- ady
+- kbp
+- syr
+- ltg
+- myv
+- iso
+- kac
+- bho
+- ay
+- kum
+- qu
+- za
+- pag
+- ngu
+- ve
+- pck
+- zap
+- tyz
+- hui
+- bbc
+- tzo
+- tiv
+- ksd
+- gom
+- min
+- ang
+- nhe
+- bgp
+- nzi
+- nnb
+- nv
+- zxx
+- bci
+- kv
+- new
+- mps
+- alt
+- meu
+- bew
+- fon
+- iu
+- abt
+- mgh
+- mnw
+- tvl
+- dov
+- tlh
+- ho
+- kw
+- mrj
+- meo
+- crh
+- mbt
+- emp
+- ace
+- ium
+- mam
+- gym
+- mai
+- crs
+- pon
+- ubu
+- fip
+- quc
+- gv
+- kj
+- btx
+- ape
+- chk
+- rcf
+- shn
+- tzh
+- mdf
+- ppk
+- ss
+- gag
+- cab
+- kri
+- seh
+- ibb
+- tbz
+- bru
+- enq
+- ach
+- cuk
+- kmb
+- wo
+- kek
+- qub
+- tab
+- bts
+- kos
+- rwo
+- cak
+- tuc
+- bum
+- cjk
+- gil
+- stq
+- tsg
+- quh
+- mak
+- arn
+- ban
+- jiv
+- sja
+- yap
+- tcy
+- toj
+- twu
+- xal
+- amu
+- rmc
+- hus
+- nia
+- kjh
+- bm
+- guh
+- mas
+- acf
+- dtp
+- ksw
+- bzj
+- din
+- zne
+- mad
+- msi
+- mag
+- mkn
+- kg
+- lhu
+- ch
+- qvi
+- mh
+- djk
+- sus
+- mfe
+- srm
+- dyu
+- ctu
+- gui
+- pau
+- inb
+- bi
+- mni
+- guc
+- jam
+- wal
+- jac
+- bas
+- gor
+- skr
+- nyu
+- noa
+- sda
+- gub
+- nog
+- cni
+- teo
+- tdx
+- sxn
+- rki
+- nr
+- frp
+- alz
+- taj
+- lrc
+- cce
+- rn
+- jvn
+- hvn
+- nij
+- dwr
+- izz
+- msm
+- bus
+- ktu
+- chr
+- maz
+- tzj
+- suz
+- knj
+- bim
+- gvl
+- bqc
+- tca
+- pis
+- prk
+- laj
+- mel
+- qxr
+- niq
+- ahk
+- shp
+- hne
+- spp
+- koi
+- krj
+- quf
+- luz
+- agr
+- tsc
+- mqy
+- gof
+- gbm
+- miq
+- dje
+- awa
+- bjj
+- qvz
+- sjp
+- tll
+- raj
+- kjg
+- bgz
+- quy
+- cbk
+- akb
+- oj
+- ify
+- mey
+- ks
+- cac
+- brx
+- qup
+- syl
+- jax
+- ff
+- ber
+- tks
+- trp
+- mrw
+- adh
+- smt
+- srr
+- ffm
+- qvc
+- mtr
+- ann
+- kaa
+- aa
+- noe
+- nut
+- gyn
+- kwi
+- xmm
+- msb
+library_name: transformers
+tags:
+- text-generation-inference
+datasets:
+- allenai/MADLAD-400
+---
+
+This model has the safetensors weights for the [Madlad-400](https://github.com/google-research/google-research/tree/master/madlad_400) 8B param **language model**.
+
+The HF transformers code to run inference is not ready yet. The [original implementation](https://github.com/google/flaxformer/blob/ea17eb012a1d340ddff017b7a534c2162aaec34c/flaxformer/architectures/t5/t5_architecture.py#L1484) is in JAX/Flaxformer.
+
+The model architecture is the same as [Palm 8B](https://arxiv.org/pdf/2204.02311.pdf).
+
+It's a decoder-only T5 with 32 layers, 16 query heads, 1 KV head, and 4096 embedding size.
+
+These are the main differences relative to the original T5 architecture:
+
+- SwiGLU Activation
+- Parallel Layers
+- Multi-Query Attention
+- RoPE Embeddings
+- Shared Input-Output Embeddings
+- No biases
+- Bidirectional attention
+- Layer Norm with `center_scale_at_zero` and final layer with `use_scale=False`
+
+If you are looking for the machine translation models, here are the available versions:
+- [3B](https://huggingface.co/jbochi/madlad400-3b-mt)
+- [7B](https://huggingface.co/jbochi/madlad400-7b-mt)
+- [7B-BT](https://huggingface.co/jbochi/madlad400-7b-mt-bt)
+- [10B](https://huggingface.co/jbochi/madlad400-10b-mt)
+
+
+Article: [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662)
+
+Abstract:
+
+> We introduce MADLAD-400, a manually audited, general domain 3T token monolingual dataset based on CommonCrawl, spanning 419 languages. We discuss the limitations revealed by self-auditing MADLAD-400, and the role data auditing had in the dataset creation process. We then train and release a 10.7B-parameter multilingual machine translation model on 250 billion tokens covering over 450 languages using publicly available data, and find that it is competitive with models that are significantly larger, and report the results on different domains. In addition, we train a 8B-parameter language model, and assess the results on few-shot translation. We make the baseline models available to the research community.
+
+
+
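The README notes that transformers inference support was still in progress at the time of this commit. The `auto_map` entries in `config.json` (added below) point at the bundled `decoderonlyt5_*` files, so the intended loading path is `trust_remote_code`. The following is a minimal sketch only, assuming the weights are pulled from the duplicated repo `jbochi/madlad400-8b-lm`; given the note above, generation may not work until the integration is finished.

```python
# Hedged sketch: load this checkpoint through the custom code shipped in this commit.
# Assumptions: repo id "jbochi/madlad400-8b-lm" (the source of this duplicate) and
# that AutoModelForCausalLM can resolve the auto_map in config.json.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "jbochi/madlad400-8b-lm"  # assumed source repo for this duplicate
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,  # needed so the custom DecoderOnlyT5Model class is used
    torch_dtype="auto",
)

inputs = tokenizer("MADLAD-400 is", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=30)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```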
added_tokens.json
ADDED
@@ -0,0 +1,2 @@
+{
+}
config.json
ADDED
@@ -0,0 +1,39 @@
+{
+  "architectures": [
+    "DecoderOnlyT5Model"
+  ],
+  "auto_map": {
+    "AutoConfig": "decoderonlyt5_config.DecoderOnlyT5Config",
+    "AutoModelForCausalLM": "decoderonlyt5_modeling.DecoderOnlyT5Model"
+  },
+  "d_ff": 16384,
+  "d_kv": 256,
+  "d_model": 4096,
+  "dropout_rate": 0.0,
+  "decoder_start_token_id": 0,
+  "pad_token_id": 1,
+  "eos_token_id": 3,
+  "feed_forward_proj": "gated-swish",
+  "initializer_factor": 1.0,
+  "is_encoder_decoder": false,
+  "is_decoder_only": true,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "t5",
+  "n_positions": 512,
+  "num_layers": 0,
+  "num_decoder_layers": 32,
+  "num_heads": 16,
+  "output_past": true,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 32,
+  "task_specific_params": {},
+  "tie_word_embeddings": true,
+  "transformers_version": "4.23.1",
+  "use_cache": true,
+  "vocab_size": 256512,
+  "parallel_layers": true,
+  "has_relative_attention_bias": false,
+  "multi_query_attention": true,
+  "use_rotary_embedding": true,
+  "rotary_embedding_max_timescale": 1000
+}
decoderonlyt5_config.py
ADDED
@@ -0,0 +1,11 @@
+from transformers.models.t5.configuration_t5 import T5Config
+
+
+class DecoderOnlyT5Config(T5Config):
+    is_decoder_only = True
+    # whether to call attention and mlp in parallel.
+    # https://github.com/google/flaxformer/blob/ea17eb012a1d340ddff017b7a534c2162aaec34c/flaxformer/architectures/t5/t5_architecture.py#L384
+    parallel_layers = True
+    has_relative_attention_bias = False
+    # https://arxiv.org/abs/1911.02150
+    multi_query_attention = True
+
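As a small sanity check on the numbers in `config.json` above, the sketch below works out the projection widths that the attention module in `decoderonlyt5_modeling.py` (next file) derives from them. The values are copied from the config; nothing else is assumed.

```python
# Illustrative arithmetic only: shapes implied by config.json for this checkpoint.
d_model = 4096        # embedding size
d_kv = 256            # per-head dimension
num_heads = 16        # query heads
num_kv_heads = 1      # multi-query attention: a single shared key/value head
d_ff = 16384          # width of each SwiGLU branch

q_proj_out = num_heads * d_kv       # 4096 -> q is Linear(d_model, inner_dim)
kv_proj_out = num_kv_heads * d_kv   # 256  -> k and v are Linear(d_model, kv_inner_dim)
print(q_proj_out, kv_proj_out, d_ff)
```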
decoderonlyt5_modeling.py
ADDED
@@ -0,0 +1,840 @@
+import copy
+import math
+from typing import Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers.models.t5 import modeling_t5
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.utils import (
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+
+from .decoderonlyt5_config import DecoderOnlyT5Config
+
+
+logger = logging.get_logger(__name__)
+_CONFIG_FOR_DOC = "DecoderOnlyT5Config"
+
+
+class DecoderOnlyT5LayerNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6, use_scale=True, center_scale_at_zero=False):
+        """
+        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
+        """
+        super().__init__()
+        if use_scale:
+            self.weight = nn.Parameter(torch.ones(hidden_size))
+        else:
+            assert not center_scale_at_zero
+            self.weight = None
+        self.center_scale_at_zero = center_scale_at_zero
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        # https://github.com/google/flaxformer/blob/ea17eb012a1d340ddff017b7a534c2162aaec34c/flaxformer/components/layer_norm.py#L30
+
+        # layer norm should always be calculated in float32
+        mean2 = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(mean2 + self.variance_epsilon)
+
+        # convert into float16 if necessary
+        if self.weight is None:
+            return hidden_states
+        if self.weight.dtype == torch.float16:
+            hidden_states = hidden_states.to(torch.float16)
+        if self.center_scale_at_zero:
+            return (self.weight + 1.0) * hidden_states
+        else:
+            return self.weight * hidden_states
+
+
+class DecoderOnlyT5LayerFF(modeling_t5.T5LayerFF):
+    def __init__(self, config: DecoderOnlyT5Config):
+        super(modeling_t5.T5LayerFF, self).__init__()
+        if config.is_gated_act:
+            self.DenseReluDense = modeling_t5.T5DenseGatedActDense(config)
+        else:
+            self.DenseReluDense = modeling_t5.T5DenseActDense(config)
+
+        if not config.parallel_layers:
+            self.layer_norm = DecoderOnlyT5LayerNorm(
+                config.d_model, eps=config.layer_norm_epsilon
+            )
+        else:
+            self.layer_norm = nn.Identity()
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+
+# LlamaRotaryEmbedding
+class DecoderOnlyT5RotaryEmbedding(nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (
+            self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
+        )
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings,
+            device=self.inv_freq.device,
+            dtype=torch.get_default_dtype(),
+        )
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(
+            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
+        )
+
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+        return (
+            self.cos_cached[:seq_len].to(dtype=x.dtype),
+            self.sin_cached[:seq_len].to(dtype=x.dtype),
+        )
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`):
+            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
+            used to pass offsetted position ids when working with a KV-cache.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
+    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+# https://github.com/huggingface/transformers/blob/7ee995fd9c692761c4601ddbffa2ac2ec9f27b0b/src/transformers/models/llama/modeling_llama.py#L263
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(
+        batch, num_key_value_heads, n_rep, slen, head_dim
+    )
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+class DecoderOnlyT5Attention(modeling_t5.T5Attention):
+    """
+    Supports both multi-head and multi-query attention.
+    https://arxiv.org/abs/1911.02150
+    https://github.com/google/flaxformer/blob/ea17eb012a1d340ddff017b7a534c2162aaec34c/flaxformer/components/attention/dense_attention.py#L292
+    """
+
+    def __init__(self, config: DecoderOnlyT5Config, has_relative_attention_bias=False):
+        super(modeling_t5.T5Attention, self).__init__()
+        self.is_decoder = config.is_decoder
+        assert not has_relative_attention_bias
+        assert config.use_rotary_embedding
+        self.d_model = config.d_model
+        self.head_dim = config.d_kv
+        self.num_heads = config.num_heads
+        self.num_key_value_heads = 1 if config.multi_query_attention else self.num_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.attention_dropout = config.dropout_rate
+        self.inner_dim = self.num_heads * self.head_dim
+        self.kv_inner_dim = self.num_key_value_heads * self.head_dim
+        self.rotary_emb = DecoderOnlyT5RotaryEmbedding(
+            self.head_dim,
+            max_position_embeddings=config.relative_attention_max_distance,
+            base=config.rotary_embedding_max_timescale,
+        )
+
+        # Mesh TensorFlow initialization to avoid scaling before softmax
+        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.k = nn.Linear(self.d_model, self.kv_inner_dim, bias=False)
+        self.v = nn.Linear(self.d_model, self.kv_inner_dim, bias=False)
+        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
+
+        self.pruned_heads = set()
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states=None,
+        position_bias=None,
+        mask: Optional[torch.Tensor] = None,
+        layer_head_mask=None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        assert key_value_states is None
+        assert position_bias is None
+        assert layer_head_mask is None
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q(hidden_states)
+        key_states = self.k(hidden_states)
+        value_states = self.v(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value[0].shape[-2]
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        if past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+        past_key_value = (key_states, value_states) if use_cache else None
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if mask is not None:
+            if mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {mask.size()}"
+                )
+            attn_weights = attn_weights + mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.inner_dim)
+        attn_output = self.o(attn_output)
+
+        present_key_value_state = (
+            (key_states, value_states) if (self.is_decoder and use_cache) else None
+        )
+        outputs = (attn_output,) + (present_key_value_state,) + (position_bias,)
+
+        if output_attentions:
+            outputs = outputs + (attn_weights,)
+        return outputs
+
+
+class DecoderOnlyT5LayerSelfAttention(modeling_t5.T5LayerSelfAttention):
+    def __init__(self, config, has_relative_attention_bias=False):
+        super(modeling_t5.T5LayerSelfAttention, self).__init__()
+        self.SelfAttention = DecoderOnlyT5Attention(
+            config, has_relative_attention_bias=has_relative_attention_bias
+        )
+        self.layer_norm = DecoderOnlyT5LayerNorm(
+            config.d_model,
+            eps=config.layer_norm_epsilon,
+            use_scale=True,
+            center_scale_at_zero=True,
+        )
+        self.dropout = nn.Dropout(config.dropout_rate)
+        self.parallel_layers = config.parallel_layers
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_bias=None,
+        position_ids=None,
+        layer_head_mask=None,
+        past_key_value=None,
+        use_cache=False,
+        output_attentions=False,
+    ):
+        if not self.parallel_layers:
+            x = self.layer_norm(hidden_states)
+        else:
+            x = hidden_states
+        attention_output = self.SelfAttention(
+            x,
+            mask=attention_mask,
+            position_bias=position_bias,
+            position_ids=position_ids,
+            layer_head_mask=layer_head_mask,
+            past_key_value=past_key_value,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+        )
+        if not self.parallel_layers:
+            # When parallel_layers is True, the residual connection is applied
+            # in the decoder block instead of here.
+            hidden_states = hidden_states + self.dropout(attention_output[0])
+        else:
+            hidden_states = attention_output[0]
+        outputs = (hidden_states,) + attention_output[1:]  # add attentions if we output them
+        return outputs
+
+
+class DecoderOnlyT5Block(modeling_t5.T5Block):
+    def __init__(self, config, has_relative_attention_bias=False):
+        super(modeling_t5.T5Block, self).__init__()
+        self.is_decoder = config.is_decoder
+        self.is_decoder_only = config.is_decoder_only
+        self.layer = nn.ModuleList()
+        self.layer.append(
+            DecoderOnlyT5LayerSelfAttention(
+                config, has_relative_attention_bias=has_relative_attention_bias
+            )
+        )
+        if self.is_decoder:
+            if config.is_decoder_only:
+                self.layer.append(nn.Identity())
+            else:
+                self.layer.append(modeling_t5.T5LayerCrossAttention(config))
+        self.parallel_layers = config.parallel_layers
+        self.layer.append(DecoderOnlyT5LayerFF(config))
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_bias=None,
+        position_ids=None,
+        encoder_hidden_states=None,
+        layer_head_mask=None,
+        past_key_value=None,
+        use_cache=False,
+        output_attentions=False,
+        encoder_attention_mask=None,
+        encoder_decoder_position_bias=None,
+        cross_attn_layer_head_mask=None,
+        return_dict=True,
+    ):
+        assert encoder_attention_mask is None
+        assert encoder_decoder_position_bias is None
+        assert cross_attn_layer_head_mask is None
+        if past_key_value is not None:
+            expected_num_past_key_values = 2 if encoder_hidden_states is None else 4
+
+            if len(past_key_value) != expected_num_past_key_values:
+                raise ValueError(
+                    f"There should be {expected_num_past_key_values} past states. "
+                    f"{'2 (past / key) for cross attention. ' if expected_num_past_key_values == 4 else ''}"
+                    f"Got {len(past_key_value)} past key / value states"
+                )
+            self_attn_past_key_value = past_key_value[:2]
+        else:
+            self_attn_past_key_value = None
+
+        ff_layer = self.layer[-1]
+        if self.parallel_layers:
+            # https://github.com/google/flaxformer/blob/ea17eb012a1d340ddff017b7a534c2162aaec34c/flaxformer/architectures/t5/t5_architecture.py#L563-L568
+            x = self.layer[0].layer_norm(hidden_states)
+            ff_output = ff_layer(x)
+        else:
+            x = hidden_states
+
+        self_attention_outputs = self.layer[0](
+            x,
+            attention_mask=attention_mask,
+            position_bias=position_bias,
+            position_ids=position_ids,
+            layer_head_mask=layer_head_mask,
+            past_key_value=self_attn_past_key_value,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+        )
+        x, present_key_value_state = self_attention_outputs[:2]
+        attention_outputs = self_attention_outputs[2:]  # Keep self-attention outputs and relative position weights
+
+        # clamp inf values to enable fp16 training
+        if x.dtype == torch.float16:
+            clamp_value = torch.where(
+                torch.isinf(x).any(),
+                torch.finfo(x.dtype).max - 1000,
+                torch.finfo(x.dtype).max,
+            )
+            x = torch.clamp(x, min=-clamp_value, max=clamp_value)
+
+        do_cross_attention = (
+            self.is_decoder
+            and not self.is_decoder_only
+            and encoder_hidden_states is not None
+        )
+        assert not do_cross_attention
+
+        if self.parallel_layers:
+            # https://github.com/google/flaxformer/blob/ea17eb012a1d340ddff017b7a534c2162aaec34c/flaxformer/architectures/t5/t5_architecture.py#L534-L578
+            x = x + ff_output
+            x *= 2**-0.5
+            hidden_states = hidden_states + self.layer[0].dropout(x)
+        else:
+            hidden_states = ff_layer(x)
+
+        # clamp inf values to enable fp16 training
+        if hidden_states.dtype == torch.float16:
+            clamp_value = torch.where(
+                torch.isinf(hidden_states).any(),
+                torch.finfo(hidden_states.dtype).max - 1000,
+                torch.finfo(hidden_states.dtype).max,
+            )
+            hidden_states = torch.clamp(
+                hidden_states, min=-clamp_value, max=clamp_value
+            )
+
+        outputs = (hidden_states,)
+
+        if use_cache:
+            outputs = outputs + (present_key_value_state,) + attention_outputs
+        else:
+            outputs = outputs + attention_outputs
+
+        return outputs  # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)
+
+
+class DecoderOnlyT5Stack(modeling_t5.T5Stack):
+    def __init__(self, config, embed_tokens=None):
+        super(modeling_t5.T5Stack, self).__init__(config)
+
+        self.embed_tokens = embed_tokens
+        self.is_decoder = config.is_decoder
+
+        self.block = nn.ModuleList(
+            [
+                DecoderOnlyT5Block(
+                    config,
+                    has_relative_attention_bias=(
+                        config.has_relative_attention_bias and bool(i == 0)
+                    ),
+                )
+                for i in range(config.num_layers)
+            ]
+        )
+        self.final_layer_norm = DecoderOnlyT5LayerNorm(
+            config.d_model,
+            eps=config.layer_norm_epsilon,
+            use_scale=False,
+            center_scale_at_zero=False,
+        )
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        inputs_embeds=None,
+        head_mask=None,
+        cross_attn_head_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        # Model parallel
+        if self.model_parallel:
+            torch.cuda.set_device(self.first_device)
+            self.embed_tokens = self.embed_tokens.to(self.first_device)
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        if input_ids is not None and inputs_embeds is not None:
+            err_msg_prefix = "decoder_" if self.is_decoder else ""
+            raise ValueError(
+                f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
+            )
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            err_msg_prefix = "decoder_" if self.is_decoder else ""
+            raise ValueError(
+                f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds"
+            )
+
+        if position_ids is None:
+            seq_length = input_ids.shape[1]
+            past_key_values_length = (
+                0 if past_key_values is None else past_key_values[0][0].shape[2]
+            )
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length,
+                seq_length + past_key_values_length,
+                dtype=torch.long,
+                device=device,
+            ).unsqueeze(0)
+
+        if inputs_embeds is None:
+            if self.embed_tokens is None:
+                raise ValueError(
+                    "You have to initialize the model with valid token embeddings"
+                )
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        batch_size, seq_length = input_shape
+
+        # required mask seq length can be calculated via length of past
+        mask_seq_length = (
+            past_key_values[0][0].shape[2] + seq_length
+            if past_key_values is not None
+            else seq_length
+        )
+
+        if use_cache is True:
+            if not self.is_decoder:
+                raise ValueError(
+                    f"`use_cache` can only be set to `True` if {self} is used as a decoder"
+                )
+
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                batch_size, mask_seq_length, device=inputs_embeds.device
+            )
+
+        # initialize past_key_values with `None` if past does not exist
+        if past_key_values is None:
+            past_key_values = [None] * len(self.block)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask = self.get_extended_attention_mask(
+            attention_mask, input_shape
+        )
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # Prepare head mask if needed
+        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
+        cross_attn_head_mask = self.get_head_mask(
+            cross_attn_head_mask, self.config.num_layers
+        )
+        present_key_value_states = () if use_cache else None
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and self.is_decoder) else None
+        position_bias = None
+
+        hidden_states = self.dropout(inputs_embeds)
+
+        for i, (layer_module, past_key_value) in enumerate(
+            zip(self.block, past_key_values)
+        ):
+            layer_head_mask = head_mask[i]
+            cross_attn_layer_head_mask = cross_attn_head_mask[i]
+            # Model parallel
+            if self.model_parallel:
+                torch.cuda.set_device(hidden_states.device)
+                # Ensure that attention_mask is always on the same device as hidden_states
+                if attention_mask is not None:
+                    attention_mask = attention_mask.to(hidden_states.device)
+                if position_bias is not None:
+                    position_bias = position_bias.to(hidden_states.device)
+                if layer_head_mask is not None:
+                    layer_head_mask = layer_head_mask.to(hidden_states.device)
+
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    layer_module.forward,
+                    hidden_states,
+                    extended_attention_mask,
+                    position_bias,
+                    None,
+                    None,
+                    None,
+                    layer_head_mask,
+                    cross_attn_layer_head_mask,
+                    None,  # past_key_value is always None with gradient checkpointing
+                    use_cache,
+                    output_attentions,
+                )
+            else:
+                layer_outputs = layer_module(
+                    hidden_states,
+                    attention_mask=extended_attention_mask,
+                    position_bias=position_bias,
+                    position_ids=position_ids,
+                    encoder_hidden_states=None,
+                    encoder_attention_mask=None,
+                    encoder_decoder_position_bias=None,
+                    layer_head_mask=layer_head_mask,
+                    cross_attn_layer_head_mask=cross_attn_layer_head_mask,
+                    past_key_value=past_key_value,
+                    use_cache=use_cache,
+                    output_attentions=output_attentions,
+                )
+
+            # layer_outputs is a tuple with:
+            # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)
+            if use_cache is False:
+                layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:]
+
+            hidden_states, present_key_value_state = layer_outputs[:2]
+
+            # We share the position biases between the layers - the first layer store them
+            # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights),
+            # (cross-attention position bias), (cross-attention weights)
+            position_bias = layer_outputs[2]
+            # append next layer key value states
+            if use_cache:
+                present_key_value_states = present_key_value_states + (
+                    present_key_value_state,
+                )
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[3],)
+                if self.is_decoder:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[5],)
+
+            # Model Parallel: If it's the last layer for that device, put things on the next device
+            if self.model_parallel:
+                for k, v in self.device_map.items():
+                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
+                        hidden_states = hidden_states.to("cuda:" + str(k + 1))
+
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        # Add last layer
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    present_key_value_states,
+                    all_hidden_states,
+                    all_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return modeling_t5.BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=present_key_value_states,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+class DecoderOnlyT5Model(modeling_t5.T5ForConditionalGeneration):
+    def __init__(self, config: DecoderOnlyT5Config):
+        super(modeling_t5.T5ForConditionalGeneration, self).__init__(config)
+        self.model_dim = config.d_model
+
+        self.shared = nn.Embedding(config.vocab_size, config.d_model)
+        assert (
+            self.config.num_layers == 0
+        ), "Decoder only model cannot have encoder layers"
+        self.encoder = None
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        decoder_config.is_encoder_decoder = False
+        decoder_config.num_layers = config.num_decoder_layers
+        self.decoder = DecoderOnlyT5Stack(decoder_config, self.shared)
+
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+    def _tie_weights(self):
+        if not self.config.tie_word_embeddings:
+            return
+        if self.decoder:
+            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)
+
+    @add_start_docstrings_to_model_forward(modeling_t5.T5_INPUTS_DOCSTRING)
+    @replace_return_docstrings(
+        output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
+            labels in `[0, ..., config.vocab_size]`
+
+        Returns:
+
+        Examples:
+
+        ```"""
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        if self.model_parallel:
+            torch.cuda.set_device(self.decoder.first_device)
+
+        # Set device for model parallelism
+        if self.model_parallel:
+            torch.cuda.set_device(self.decoder.first_device)
+            if input_ids is not None:
+                input_ids = input_ids.to(self.decoder.first_device)
+            if attention_mask is not None:
+                attention_mask = attention_mask.to(self.decoder.first_device)
+
+        # Decode
+        outputs = self.decoder(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            past_key_values=past_key_values,
+            encoder_hidden_states=None,
+            encoder_attention_mask=None,
+            head_mask=None,
+            cross_attn_head_mask=None,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        # Set device for model parallelism
+        if self.model_parallel:
+            torch.cuda.set_device(self.decoder.first_device)
+            self.lm_head = self.lm_head.to(self.decoder.first_device)
+            sequence_output = sequence_output.to(self.lm_head.weight.device)
+
+        if self.config.tie_word_embeddings:
+            # Rescale output before projecting on vocab
+            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
+            sequence_output = sequence_output * (self.model_dim**-0.5)
+
+        lm_logits = self.lm_head(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss(ignore_index=-100)
+            # move labels to correct device to enable PP
+            labels = labels.to(lm_logits.device)
+            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
+            # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
+
+        if not return_dict:
+            output = (lm_logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
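The distinctive step in `DecoderOnlyT5Block` above is the parallel-layers residual. The sketch below is a simplified restatement of that path, not new functionality: attention and feed-forward both read the same normalized input, their outputs are summed, rescaled by `2**-0.5` as in the referenced flaxformer implementation, and added back to the residual stream. Dropout and the fp16 clamping present in the real code are omitted here.

```python
# Simplified restatement of the parallel-layers path in DecoderOnlyT5Block.
def parallel_block(hidden_states, layer_norm, self_attention, feed_forward):
    x = layer_norm(hidden_states)                 # shared pre-norm input
    y = self_attention(x) + feed_forward(x)       # attention and MLP run "in parallel"
    return hidden_states + y * 2**-0.5            # rescaled residual update
```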
model-00000-of-00007.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee3b0c4eef668152f9f1106f18bf0a892bd04ba8b26017d7d5865f49dec5f3c
+size 5150622792
model-00001-of-00007.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29b44a655d9261522e705d963d1fa7ca1717c3e6bcfb9402fa69cb8ee6156c6f
+size 4739650416
model-00002-of-00007.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1dbd16405fa07722d953ba5c99aeb8ae05c1068cb4018a9622e5828336c1b9c8
+size 4739650424
model-00003-of-00007.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7444d7670f145f12706970a191b260b609a82626c5e417f08f1039c05fbdda75
+size 4739650456
model-00004-of-00007.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:471e481023c3f4622d2ff0b1031a34c55469fb10fe1252f5c5c62e8f95418b4b
+size 4739650456
model-00005-of-00007.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e934d8fcf5bc54649e82b29c1322b7ce3d13e8915d2cac205e65660e0a4cdbbb
+size 4739650456
model-00006-of-00007.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3f82de95453b49cb5a78ec517fac3219556f83edd9075a20a7ac95a577b5e93
+size 4739650456
model-00007-of-00007.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f816e875186339eaf9afa8964d34952c80e697ade369154512f9900ea0a33553
+size 947930104
model.safetensors.index.json
ADDED
@@ -0,0 +1,262 @@
+{
+  "metadata": {},
+  "weight_map": {
+    "shared.weight": "model-00000-of-00007.safetensors",
+    "decoder.block.0.layer.0.layer_norm.weight": "model-00000-of-00007.safetensors",
+    "decoder.block.0.layer.0.SelfAttention.k.weight": "model-00000-of-00007.safetensors",
+    "decoder.block.0.layer.0.SelfAttention.o.weight": "model-00000-of-00007.safetensors",
+    "decoder.block.0.layer.0.SelfAttention.q.weight": "model-00000-of-00007.safetensors",
+    "decoder.block.0.layer.0.SelfAttention.v.weight": "model-00000-of-00007.safetensors",
+    "decoder.block.0.layer.2.DenseReluDense.wi_0.weight": "model-00000-of-00007.safetensors",
+    "decoder.block.0.layer.2.DenseReluDense.wi_1.weight": "model-00000-of-00007.safetensors",
+    "decoder.block.0.layer.2.DenseReluDense.wo.weight": "model-00000-of-00007.safetensors",
+    "decoder.block.1.layer.0.layer_norm.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.1.layer.0.SelfAttention.k.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.1.layer.0.SelfAttention.o.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.1.layer.0.SelfAttention.q.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.1.layer.0.SelfAttention.v.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.1.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.1.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.1.layer.2.DenseReluDense.wo.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.2.layer.0.layer_norm.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.2.layer.0.SelfAttention.k.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.2.layer.0.SelfAttention.o.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.2.layer.0.SelfAttention.q.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.2.layer.0.SelfAttention.v.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.2.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.2.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.2.layer.2.DenseReluDense.wo.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.3.layer.0.layer_norm.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.3.layer.0.SelfAttention.k.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.3.layer.0.SelfAttention.o.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.3.layer.0.SelfAttention.q.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.3.layer.0.SelfAttention.v.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.3.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.3.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.3.layer.2.DenseReluDense.wo.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.4.layer.0.layer_norm.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.4.layer.0.SelfAttention.k.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.4.layer.0.SelfAttention.o.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.4.layer.0.SelfAttention.q.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.4.layer.0.SelfAttention.v.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.4.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.4.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.4.layer.2.DenseReluDense.wo.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.5.layer.0.layer_norm.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.5.layer.0.SelfAttention.k.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.5.layer.0.SelfAttention.o.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.5.layer.0.SelfAttention.q.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.5.layer.0.SelfAttention.v.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.5.layer.2.DenseReluDense.wi_0.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.5.layer.2.DenseReluDense.wi_1.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.5.layer.2.DenseReluDense.wo.weight": "model-00001-of-00007.safetensors",
+    "decoder.block.6.layer.0.layer_norm.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.6.layer.0.SelfAttention.k.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.6.layer.0.SelfAttention.o.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.6.layer.0.SelfAttention.q.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.6.layer.0.SelfAttention.v.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.6.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.6.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.6.layer.2.DenseReluDense.wo.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.7.layer.0.layer_norm.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.7.layer.0.SelfAttention.k.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.7.layer.0.SelfAttention.o.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.7.layer.0.SelfAttention.q.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.7.layer.0.SelfAttention.v.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.7.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.7.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.7.layer.2.DenseReluDense.wo.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.8.layer.0.layer_norm.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.8.layer.0.SelfAttention.k.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.8.layer.0.SelfAttention.o.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.8.layer.0.SelfAttention.q.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.8.layer.0.SelfAttention.v.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.8.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.8.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.8.layer.2.DenseReluDense.wo.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.9.layer.0.layer_norm.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.9.layer.0.SelfAttention.k.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.9.layer.0.SelfAttention.o.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.9.layer.0.SelfAttention.q.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.9.layer.0.SelfAttention.v.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.9.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.9.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.9.layer.2.DenseReluDense.wo.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.10.layer.0.layer_norm.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.10.layer.0.SelfAttention.k.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.10.layer.0.SelfAttention.o.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.10.layer.0.SelfAttention.q.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.10.layer.0.SelfAttention.v.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.10.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.10.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.10.layer.2.DenseReluDense.wo.weight": "model-00002-of-00007.safetensors",
+    "decoder.block.11.layer.0.layer_norm.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.11.layer.0.SelfAttention.k.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.11.layer.0.SelfAttention.o.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.11.layer.0.SelfAttention.q.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.11.layer.0.SelfAttention.v.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.11.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.11.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.11.layer.2.DenseReluDense.wo.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.12.layer.0.layer_norm.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.12.layer.0.SelfAttention.k.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.12.layer.0.SelfAttention.o.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.12.layer.0.SelfAttention.q.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.12.layer.0.SelfAttention.v.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.12.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.12.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.12.layer.2.DenseReluDense.wo.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.13.layer.0.layer_norm.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.13.layer.0.SelfAttention.k.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.13.layer.0.SelfAttention.o.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.13.layer.0.SelfAttention.q.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.13.layer.0.SelfAttention.v.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.13.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.13.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.13.layer.2.DenseReluDense.wo.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.14.layer.0.layer_norm.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.14.layer.0.SelfAttention.k.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.14.layer.0.SelfAttention.o.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.14.layer.0.SelfAttention.q.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.14.layer.0.SelfAttention.v.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.14.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.14.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.14.layer.2.DenseReluDense.wo.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.15.layer.0.layer_norm.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.15.layer.0.SelfAttention.k.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.15.layer.0.SelfAttention.o.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.15.layer.0.SelfAttention.q.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.15.layer.0.SelfAttention.v.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.15.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.15.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.15.layer.2.DenseReluDense.wo.weight": "model-00003-of-00007.safetensors",
+    "decoder.block.16.layer.0.layer_norm.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.16.layer.0.SelfAttention.k.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.16.layer.0.SelfAttention.o.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.16.layer.0.SelfAttention.q.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.16.layer.0.SelfAttention.v.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.16.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.16.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.16.layer.2.DenseReluDense.wo.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.17.layer.0.layer_norm.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.17.layer.0.SelfAttention.k.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.17.layer.0.SelfAttention.o.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.17.layer.0.SelfAttention.q.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.17.layer.0.SelfAttention.v.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.17.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.17.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.17.layer.2.DenseReluDense.wo.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.18.layer.0.layer_norm.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.18.layer.0.SelfAttention.k.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.18.layer.0.SelfAttention.o.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.18.layer.0.SelfAttention.q.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.18.layer.0.SelfAttention.v.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.18.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.18.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.18.layer.2.DenseReluDense.wo.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.19.layer.0.layer_norm.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.19.layer.0.SelfAttention.k.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.19.layer.0.SelfAttention.o.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.19.layer.0.SelfAttention.q.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.19.layer.0.SelfAttention.v.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.19.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.19.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.19.layer.2.DenseReluDense.wo.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.20.layer.0.layer_norm.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.20.layer.0.SelfAttention.k.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.20.layer.0.SelfAttention.o.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.20.layer.0.SelfAttention.q.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.20.layer.0.SelfAttention.v.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.20.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.20.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.20.layer.2.DenseReluDense.wo.weight": "model-00004-of-00007.safetensors",
+    "decoder.block.21.layer.0.layer_norm.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.21.layer.0.SelfAttention.k.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.21.layer.0.SelfAttention.o.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.21.layer.0.SelfAttention.q.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.21.layer.0.SelfAttention.v.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.21.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.21.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.21.layer.2.DenseReluDense.wo.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.22.layer.0.layer_norm.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.22.layer.0.SelfAttention.k.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.22.layer.0.SelfAttention.o.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.22.layer.0.SelfAttention.q.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.22.layer.0.SelfAttention.v.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.22.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.22.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.22.layer.2.DenseReluDense.wo.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.23.layer.0.layer_norm.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.23.layer.0.SelfAttention.k.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.23.layer.0.SelfAttention.o.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.23.layer.0.SelfAttention.q.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.23.layer.0.SelfAttention.v.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.23.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.23.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.23.layer.2.DenseReluDense.wo.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.24.layer.0.layer_norm.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.24.layer.0.SelfAttention.k.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.24.layer.0.SelfAttention.o.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.24.layer.0.SelfAttention.q.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.24.layer.0.SelfAttention.v.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.24.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.24.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.24.layer.2.DenseReluDense.wo.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.25.layer.0.layer_norm.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.25.layer.0.SelfAttention.k.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.25.layer.0.SelfAttention.o.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.25.layer.0.SelfAttention.q.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.25.layer.0.SelfAttention.v.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.25.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.25.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.25.layer.2.DenseReluDense.wo.weight": "model-00005-of-00007.safetensors",
+    "decoder.block.26.layer.0.layer_norm.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.26.layer.0.SelfAttention.k.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.26.layer.0.SelfAttention.o.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.26.layer.0.SelfAttention.q.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.26.layer.0.SelfAttention.v.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.26.layer.2.DenseReluDense.wi_0.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.26.layer.2.DenseReluDense.wi_1.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.26.layer.2.DenseReluDense.wo.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.27.layer.0.layer_norm.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.27.layer.0.SelfAttention.k.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.27.layer.0.SelfAttention.o.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.27.layer.0.SelfAttention.q.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.27.layer.0.SelfAttention.v.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.27.layer.2.DenseReluDense.wi_0.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.27.layer.2.DenseReluDense.wi_1.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.27.layer.2.DenseReluDense.wo.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.28.layer.0.layer_norm.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.28.layer.0.SelfAttention.k.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.28.layer.0.SelfAttention.o.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.28.layer.0.SelfAttention.q.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.28.layer.0.SelfAttention.v.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.28.layer.2.DenseReluDense.wi_0.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.28.layer.2.DenseReluDense.wi_1.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.28.layer.2.DenseReluDense.wo.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.29.layer.0.layer_norm.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.29.layer.0.SelfAttention.k.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.29.layer.0.SelfAttention.o.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.29.layer.0.SelfAttention.q.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.29.layer.0.SelfAttention.v.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.29.layer.2.DenseReluDense.wi_0.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.29.layer.2.DenseReluDense.wi_1.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.29.layer.2.DenseReluDense.wo.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.30.layer.0.layer_norm.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.30.layer.0.SelfAttention.k.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.30.layer.0.SelfAttention.o.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.30.layer.0.SelfAttention.q.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.30.layer.0.SelfAttention.v.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.30.layer.2.DenseReluDense.wi_0.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.30.layer.2.DenseReluDense.wi_1.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.30.layer.2.DenseReluDense.wo.weight": "model-00006-of-00007.safetensors",
+    "decoder.block.31.layer.0.layer_norm.weight": "model-00007-of-00007.safetensors",
+    "decoder.block.31.layer.0.SelfAttention.k.weight": "model-00007-of-00007.safetensors",
+    "decoder.block.31.layer.0.SelfAttention.o.weight": "model-00007-of-00007.safetensors",
+    "decoder.block.31.layer.0.SelfAttention.q.weight": "model-00007-of-00007.safetensors",
+    "decoder.block.31.layer.0.SelfAttention.v.weight": "model-00007-of-00007.safetensors",
+    "decoder.block.31.layer.2.DenseReluDense.wi_0.weight": "model-00007-of-00007.safetensors",
+    "decoder.block.31.layer.2.DenseReluDense.wi_1.weight": "model-00007-of-00007.safetensors",
+    "decoder.block.31.layer.2.DenseReluDense.wo.weight": "model-00007-of-00007.safetensors"
+  }
+}
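The index maps each parameter name to the shard that stores it, so a loader only has to open the files it actually needs. A minimal sketch of resolving one tensor by hand with the `safetensors` API (the local checkpoint directory is an assumption; `transformers` performs this resolution automatically for sharded checkpoints):

```python
import json
import os

from safetensors import safe_open

checkpoint_dir = "."  # assumed: directory containing the shards and the index file

with open(os.path.join(checkpoint_dir, "model.safetensors.index.json")) as f:
    index = json.load(f)

name = "decoder.block.0.layer.0.SelfAttention.q.weight"
shard_file = index["weight_map"][name]  # e.g. "model-00000-of-00007.safetensors"

with safe_open(os.path.join(checkpoint_dir, shard_file), framework="pt") as f:
    tensor = f.get_tensor(name)

print(shard_file, tuple(tensor.shape))
```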
special_tokens_map.json
ADDED
@@ -0,0 +1,23 @@
+{
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
spiece.model
ADDED
@@ -0,0 +1,3 @@
1 + version https://git-lfs.github.com/spec/v1
2 + oid sha256:ef11ac9a22c7503492f56d48dce53be20e339b63605983e9f27d2cd0e0f3922c
3 + size 4427844
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
1 + version https://git-lfs.github.com/spec/v1
2 + oid sha256:a2799ccc696b752ba00c34f58726bfe253a04921ceb6cfc620400f560474790b
3 + size 16629031
tokenizer_config.json
ADDED
@@ -0,0 +1,38 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "</s>",
+  "extra_ids": 0,
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<s>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "T5Tokenizer",
+  "unk_token": "<unk>"
+}
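Together, `spiece.model`, `tokenizer.json`, `special_tokens_map.json`, and `tokenizer_config.json` define a `T5Tokenizer` whose pad token is `<s>`, eos token is `</s>`, and unk token is `<unk>` (ids 1, 2, and 0 per `added_tokens_decoder` above). A quick sanity check, assuming the files are available in a local directory (the `.` path is an assumption; a Hub repo id would work in its place):

```python
from transformers import AutoTokenizer

# Assumed local path; replace with the actual checkpoint location or repo id.
tokenizer = AutoTokenizer.from_pretrained(".")

print(tokenizer.pad_token, tokenizer.pad_token_id)  # expected: <s> 1
print(tokenizer.eos_token, tokenizer.eos_token_id)  # expected: </s> 2
print(tokenizer.unk_token, tokenizer.unk_token_id)  # expected: <unk> 0

ids = tokenizer("Hello world").input_ids
print(ids, tokenizer.decode(ids))
```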