mt5 config #1
by jbochi - opened
- .gitattributes +0 -4
- README.md +3 -621
- added_tokens.json +0 -2
- config.json +22 -27
- model-q4k.gguf → flax_model.msgpack +2 -2
- generation_config.json +2 -2
- model.safetensors +2 -2
- model-q2k.gguf → pytorch_model-00001-of-00002.bin +2 -2
- model-q3k.gguf → pytorch_model-00002-of-00002.bin +2 -2
- pytorch_model.bin.index.json +751 -0
- special_tokens_map.json +105 -21
- spiece.model +2 -2
- tokenizer.json +0 -0
- tokenizer_config.json +104 -29
.gitattributes
CHANGED
@@ -33,7 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-tokenizer.json filter=lfs diff=lfs merge=lfs -text
-model-q4k.gguf filter=lfs diff=lfs merge=lfs -text
-model-q2k.gguf filter=lfs diff=lfs merge=lfs -text
-model-q3k.gguf filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,633 +1,15 @@
 ---
 license: apache-2.0
 language:
-- multilingual
 - en
-- ru
-- es
-- fr
-- de
-- it
-- pt
-- pl
-- nl
-- vi
-- tr
-- sv
-- id
-- ro
-- cs
-- zh
-- hu
-- ja
-- th
-- fi
-- fa
-- uk
-- da
-- el
-- "no"
-- bg
-- sk
-- ko
-- ar
-- lt
-- ca
-- sl
-- he
-- et
-- lv
-- hi
-- sq
-- ms
-- az
-- sr
-- ta
-- hr
-- kk
-- is
-- ml
-- mr
-- te
-- af
-- gl
-- fil
-- be
-- mk
-- eu
-- bn
-- ka
-- mn
-- bs
-- uz
-- ur
-- sw
-- yue
-- ne
-- kn
-- kaa
-- gu
-- si
-- cy
-- eo
-- la
-- hy
-- ky
-- tg
-- ga
-- mt
-- my
-- km
-- tt
-- so
-- ku
-- ps
-- pa
-- rw
-- lo
-- ha
-- dv
-- fy
-- lb
-- ckb
-- mg
-- gd
-- am
-- ug
-- ht
-- grc
-- hmn
-- sd
-- jv
-- mi
-- tk
-- ceb
-- yi
-- ba
-- fo
-- or
-- xh
-- su
-- kl
-- ny
-- sm
-- sn
-- co
-- zu
-- ig
-- yo
-- pap
-- st
-- haw
-- as
-- oc
-- cv
-- lus
-- tet
-- gsw
-- sah
-- br
-- rm
-- sa
-- bo
-- om
-- se
-- ce
-- cnh
-- ilo
-- hil
-- udm
-- os
-- lg
-- ti
-- vec
-- ts
-- tyv
-- kbd
-- ee
-- iba
-- av
-- kha
-- to
-- tn
-- nso
-- fj
-- zza
-- ak
-- ada
-- otq
-- dz
-- bua
-- cfm
-- ln
-- chm
-- gn
-- krc
-- wa
-- hif
-- yua
-- srn
-- war
-- rom
-- bik
-- pam
-- sg
-- lu
-- ady
-- kbp
-- syr
-- ltg
-- myv
-- iso
-- kac
-- bho
-- ay
-- kum
-- qu
-- za
-- pag
-- ngu
-- ve
-- pck
-- zap
-- tyz
-- hui
-- bbc
-- tzo
-- tiv
-- ksd
-- gom
-- min
-- ang
-- nhe
-- bgp
-- nzi
-- nnb
-- nv
-- zxx
-- bci
-- kv
-- new
-- mps
-- alt
-- meu
-- bew
-- fon
-- iu
-- abt
-- mgh
-- mnw
-- tvl
-- dov
-- tlh
-- ho
-- kw
-- mrj
-- meo
-- crh
-- mbt
-- emp
-- ace
-- ium
-- mam
-- gym
-- mai
-- crs
-- pon
-- ubu
-- fip
-- quc
-- gv
-- kj
-- btx
-- ape
-- chk
-- rcf
-- shn
-- tzh
-- mdf
-- ppk
-- ss
-- gag
-- cab
-- kri
-- seh
-- ibb
-- tbz
-- bru
-- enq
-- ach
-- cuk
-- kmb
-- wo
-- kek
-- qub
-- tab
-- bts
-- kos
-- rwo
-- cak
-- tuc
-- bum
-- cjk
-- gil
-- stq
-- tsg
-- quh
-- mak
-- arn
-- ban
-- jiv
-- sja
-- yap
-- tcy
-- toj
-- twu
-- xal
-- amu
-- rmc
-- hus
-- nia
-- kjh
-- bm
-- guh
-- mas
-- acf
-- dtp
-- ksw
-- bzj
-- din
-- zne
-- mad
-- msi
-- mag
-- mkn
-- kg
-- lhu
-- ch
-- qvi
-- mh
-- djk
-- sus
-- mfe
-- srm
-- dyu
-- ctu
-- gui
-- pau
-- inb
-- bi
-- mni
-- guc
-- jam
-- wal
-- jac
-- bas
-- gor
-- skr
-- nyu
-- noa
-- sda
-- gub
-- nog
-- cni
-- teo
-- tdx
-- sxn
-- rki
-- nr
-- frp
-- alz
-- taj
-- lrc
-- cce
-- rn
-- jvn
-- hvn
-- nij
-- dwr
-- izz
-- msm
-- bus
-- ktu
-- chr
-- maz
-- tzj
-- suz
-- knj
-- bim
-- gvl
-- bqc
-- tca
-- pis
-- prk
-- laj
-- mel
-- qxr
-- niq
-- ahk
-- shp
-- hne
-- spp
-- koi
-- krj
-- quf
-- luz
-- agr
-- tsc
-- mqy
-- gof
-- gbm
-- miq
-- dje
-- awa
-- bjj
-- qvz
-- sjp
-- tll
-- raj
-- kjg
-- bgz
-- quy
-- cbk
-- akb
-- oj
-- ify
-- mey
-- ks
-- cac
-- brx
-- qup
-- syl
-- jax
-- ff
-- ber
-- tks
-- trp
-- mrw
-- adh
-- smt
-- srr
-- ffm
-- qvc
-- mtr
-- ann
-- kaa
-- aa
-- noe
-- nut
-- gyn
-- kwi
-- xmm
-- msb
 library_name: transformers
 tags:
-- text2text-generation
 - text-generation-inference
-datasets:
-- allenai/MADLAD-400
-pipeline_tag: translation
-
-widget:
-- text: "<2en> Como vai, amigo?"
-  example_title: "Translation to English"
-- text: "<2de> Do you speak German?"
-  example_title: "Translation to German"
-
 ---
 
-
-
-# Table of Contents
-
-0. [TL;DR](#TL;DR)
-1. [Model Details](#model-details)
-2. [Usage](#usage)
-3. [Uses](#uses)
-4. [Bias, Risks, and Limitations](#bias-risks-and-limitations)
-5. [Training Details](#training-details)
-6. [Evaluation](#evaluation)
-7. [Environmental Impact](#environmental-impact)
-8. [Citation](#citation)
-
-# TL;DR
-
-MADLAD-400-3B-MT is a multilingual machine translation model based on the T5 architecture that was
-trained on 1 trillion tokens covering over 450 languages using publicly available data.
-It is competitive with models that are significantly larger.
-
-**Disclaimer**: [Juarez Bochi](https://huggingface.co/jbochi), who was not involved in this research, converted
-the original weights and wrote the contents of this model card based on the original paper and Flan-T5.
-
-# Model Details
-
-## Model Description
-
-- **Model type:** Language model
-- **Language(s) (NLP):** Multilingual (400+ languages)
-- **License:** Apache 2.0
-- **Related Models:** [All MADLAD-400 Checkpoints](https://huggingface.co/models?search=madlad)
-- **Original Checkpoints:** [All Original MADLAD-400 Checkpoints](https://github.com/google-research/google-research/tree/master/madlad_400)
-- **Resources for more information:**
-  - [Research paper](https://arxiv.org/abs/2309.04662)
-  - [GitHub Repo](https://github.com/google-research/t5x)
-  - [Hugging Face MADLAD-400 Docs (Similar to T5)](https://huggingface.co/docs/transformers/model_doc/MADLAD-400) - [Pending PR](https://github.com/huggingface/transformers/pull/27471)
-
-# Usage
-
-Find below some example scripts on how to use the model:
-
-## Using the Pytorch model with `transformers`
-
-### Running the model on a CPU or GPU
-
-<details>
-<summary> Click to expand </summary>
-
-First, install the Python packages that are required:
-
-`pip install transformers accelerate sentencepiece protobuf`
-
-```python
-from transformers import T5ForConditionalGeneration, T5Tokenizer
-
-model_name = 'jbochi/madlad400-3b-mt'
-model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto")
-tokenizer = T5Tokenizer.from_pretrained(model_name)
-
-text = "<2pt> I love pizza!"
-input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)
-outputs = model.generate(input_ids=input_ids)
-
-tokenizer.decode(outputs[0], skip_special_tokens=True)
-# Eu adoro pizza!
-```
-
-</details>
-
-## Running the model with Candle
-
-<details>
-<summary> Click to expand </summary>
-
-Usage with [candle](https://github.com/huggingface/candle):
-
-```bash
-$ cargo run --example t5 --release -- \
-  --model-id "jbochi/madlad400-3b-mt" \
-  --prompt "<2de> How are you, my friend?" \
-  --decode --temperature 0
-```
-
-We also provide a quantized model (1.65 GB vs the original 11.8 GB file):
-
-```
-cargo run --example quantized-t5 --release -- \
-  --model-id "jbochi/madlad400-3b-mt" --weight-file "model-q4k.gguf" \
-  --prompt "<2de> How are you, my friend?" \
-  --temperature 0
-...
-Wie geht es dir, mein Freund?
-```
-
-</details>
-
-
-# Uses
-
-## Direct Use and Downstream Use
-
-> Primary intended uses: Machine Translation and multilingual NLP tasks on over 400 languages.
-> Primary intended users: Research community.
-
-## Out-of-Scope Use
-
-> These models are trained on general domain data and are therefore not meant to
-> work on domain-specific models out-of-the box. Moreover, these research models have not been assessed
-> for production usecases.
-
-# Bias, Risks, and Limitations
-
-> We note that we evaluate on only 204 of the languages supported by these models and on machine translation
-> and few-shot machine translation tasks. Users must consider use of this model carefully for their own
-> usecase.
-
-## Ethical considerations and risks
-
-> We trained these models with MADLAD-400 and publicly available data to create baseline models that
-> support NLP for over 400 languages, with a focus on languages underrepresented in large-scale corpora.
-> Given that these models were trained with web-crawled datasets that may contain sensitive, offensive or
-> otherwise low-quality content despite extensive preprocessing, it is still possible that these issues to the
-> underlying training data may cause differences in model performance and toxic (or otherwise problematic)
-> output for certain domains. Moreover, large models are dual use technologies that have specific risks
-> associated with their use and development. We point the reader to surveys such as those written by
-> Weidinger et al. or Bommasani et al. for a more detailed discussion of these risks, and to Liebling
-> et al. for a thorough discussion of the risks of machine translation systems.
-
-## Known Limitations
-
-More information needed
-
-## Sensitive Use:
-
-More information needed
-
-# Training Details
-
-> We train models of various sizes: a 3B, 32-layer parameter model,
-> a 7.2B 48-layer parameter model and a 10.7B 32-layer parameter model.
-> We share all parameters of the model across language pairs,
-> and use a Sentence Piece Model with 256k tokens shared on both the encoder and decoder
-> side. Each input sentence has a <2xx> token prepended to the source sentence to indicate the target
-> language.
-
-See the [research paper](https://arxiv.org/pdf/2309.04662.pdf) for further details.
-
-## Training Data
-
-> For both the machine translation and language model, MADLAD-400 is used. For the machine translation
-> model, a combination of parallel datasources covering 157 languages is also used. Further details are
-> described in the [paper](https://arxiv.org/pdf/2309.04662.pdf).
-
-## Training Procedure
-
-See the [research paper](https://arxiv.org/pdf/2309.04662.pdf) for further details.
-
-# Evaluation
-
-## Testing Data, Factors & Metrics
-
-> For evaluation, we used WMT, NTREX, Flores-200 and Gatones datasets as described in Section 4.3 in the [paper](https://arxiv.org/pdf/2309.04662.pdf).
-
-> The translation quality of this model varies based on language, as seen in the paper, and likely varies on
-> domain, though we have not assessed this.
-
-## Results
-
-![image/png](https://cdn-uploads.huggingface.co/production/uploads/64b7f632037d6452a321fa15/EzsMD1AwCuFH0S0DeD-n8.png)
-
-![image/png](https://cdn-uploads.huggingface.co/production/uploads/64b7f632037d6452a321fa15/CJ5zCUVy7vTU76Lc8NZcK.png)
-
-![image/png](https://cdn-uploads.huggingface.co/production/uploads/64b7f632037d6452a321fa15/NK0S-yVeWuhKoidpLYh3m.png)
-
-See the [research paper](https://arxiv.org/pdf/2309.04662.pdf) for further details.
-
-# Environmental Impact
-
-More information needed
-
-# Citation
+T5ForConditionalGeneration files for the [Madlad-400](https://github.com/google-research/google-research/tree/master/madlad_400) 3B parameter MT model.
 
-**
+**This is a WIP. I'm able to load and run the model, but output is nonsensical.**
 
-
-@misc{kudugunta2023madlad400,
-      title={MADLAD-400: A Multilingual And Document-Level Large Audited Dataset},
-      author={Sneha Kudugunta and Isaac Caswell and Biao Zhang and Xavier Garcia and Christopher A. Choquette-Choo and Katherine Lee and Derrick Xin and Aditya Kusupati and Romi Stella and Ankur Bapna and Orhan Firat},
-      year={2023},
-      eprint={2309.04662},
-      archivePrefix={arXiv},
-      primaryClass={cs.CL}
-}
-```
+Colab to generate these files is [here](https://colab.research.google.com/drive/1rZ2NRyl2zwmg0sQ2Wi-uZZF48iVYulTC#scrollTo=pVODoE6gA9sw).
 
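Since this PR points the repo at the mt5 architecture, a quick way to sanity-check the converted files is to load them through the classes the new config.json names (`MT5ForConditionalGeneration`, `T5Tokenizer`). A minimal sketch, assuming the standard `from_pretrained` API; the `<2xx>` target-language prefix follows the usage example removed from the old card:

```python
# Minimal load-and-translate check for the files in this PR. Assumes the
# converted checkpoint loads through the standard transformers API; the classes
# match "architectures" and "tokenizer_class" in the new config.json.
from transformers import MT5ForConditionalGeneration, T5Tokenizer

model_id = "jbochi/madlad400-3b-mt"
model = MT5ForConditionalGeneration.from_pretrained(model_id)
tokenizer = T5Tokenizer.from_pretrained(model_id)

# MADLAD-400 expects a <2xx> token naming the target language before the source text.
inputs = tokenizer("<2pt> I love pizza!", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

While the PR is marked WIP, nonsensical output from this snippet would reproduce the issue described above.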
added_tokens.json
DELETED
@@ -1,2 +0,0 @@
-{
-}
config.json
CHANGED
@@ -1,29 +1,24 @@
 {
-  "task_specific_params": {},
-  "tie_word_embeddings": false,
-  "transformers_version": "4.23.1",
-  "use_cache": true,
-  "vocab_size": 256000
+  "architectures": [
+    "MT5ForConditionalGeneration"
+  ],
+  "d_ff": 8192,
+  "d_kv": 128,
+  "d_model": 1024,
+  "decoder_start_token_id": 1,
+  "dropout_rate": 0.1,
+  "eos_token_id": 2,
+  "feed_forward_proj": "gated-gelu",
+  "initializer_factor": 1.0,
+  "is_encoder_decoder": true,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "mt5",
+  "num_decoder_layers": 32,
+  "num_heads": 16,
+  "num_layers": 32,
+  "pad_token_id": 1,
+  "relative_attention_num_buckets": 32,
+  "tie_word_embeddings": false,
+  "tokenizer_class": "T5Tokenizer",
+  "vocab_size": 256000
 }
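The new config pins down the mt5 geometry. A minimal sketch, assuming transformers' `MT5Config`, that rebuilds it in code and checks one derived quantity; note that `num_heads * d_kv` (16 × 128 = 2048) is wider than `d_model` (1024) here, which T5-style attention permits:

```python
# Rebuild the new config.json above in code (values copied from the diff).
from transformers import MT5Config

cfg = MT5Config(
    d_ff=8192,
    d_kv=128,
    d_model=1024,
    num_layers=32,
    num_decoder_layers=32,
    num_heads=16,
    feed_forward_proj="gated-gelu",
    tie_word_embeddings=False,  # input and output embeddings are separate
    vocab_size=256000,
    decoder_start_token_id=1,
    pad_token_id=1,
    eos_token_id=2,
)

# T5-style attention projects d_model -> num_heads * d_kv and back, so the
# attention inner width may differ from d_model:
print(cfg.num_heads * cfg.d_kv, cfg.d_model)  # 2048 1024
```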
model-q4k.gguf → flax_model.msgpack
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:9dc94c6dc47a8e24d50b810d3ece0ae3b78b66ef310d053557b4ffe8ad6b1b77
+size 11761528083
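These renamed entries are git-lfs pointer files, not the weights themselves: three lines giving the spec version, the sha256 of the real blob, and its byte size. A minimal parsing sketch (pointer text copied from the new side of this diff):

```python
# Parse a git-lfs pointer of the version/oid/size form shown above.
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:9dc94c6dc47a8e24d50b810d3ece0ae3b78b66ef310d053557b4ffe8ad6b1b77\n"
    "size 11761528083"
)

info = parse_lfs_pointer(pointer)
print(info["oid"])              # sha256:9dc9...
print(int(info["size"]) / 1e9)  # ~11.76 GB, the "11.8 GB file" the old card mentioned
```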
generation_config.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "_from_model_config": true,
-  "decoder_start_token_id":
+  "decoder_start_token_id": 1,
   "eos_token_id": 2,
   "pad_token_id": 1,
-  "transformers_version": "4.
+  "transformers_version": "4.33.2"
 }
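The token ids in generation_config.json are what seed and stop decoding for an encoder-decoder model. An illustrative greedy loop, a sketch of the idea rather than transformers' actual `generate` implementation:

```python
# Illustrative greedy decoding: start from decoder_start_token_id (1) and stop
# once eos_token_id (2) is emitted. Sketch only; transformers' generate()
# handles this (plus caching, beam search, etc.) internally.
import torch

def greedy_decode(model, encoder_input_ids, max_new_tokens=64,
                  decoder_start_token_id=1, eos_token_id=2):
    decoder_ids = torch.tensor([[decoder_start_token_id]])
    for _ in range(max_new_tokens):
        logits = model(input_ids=encoder_input_ids,
                       decoder_input_ids=decoder_ids).logits
        next_id = logits[0, -1].argmax().reshape(1, 1)
        decoder_ids = torch.cat([decoder_ids, next_id], dim=-1)
        if next_id.item() == eos_token_id:
            break
    return decoder_ids
```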
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:1ddca188b8a3db3666eae1b7176f9bbd20a6b378efc3dbc2a823564c5b9d8e7c
+size 12810163832
model-q2k.gguf → pytorch_model-00001-of-00002.bin
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:6a28403955d7d7699a5745c2b6e4f02eee8962d85d5e4b98e509329c72b9f607
+size 9974907514
model-q3k.gguf → pytorch_model-00002-of-00002.bin
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:3368d731cf5b286d309f44ccae713abdc074ac71668b0a13ba533968f4ad6972
+size 1786851801
pytorch_model.bin.index.json
ADDED
@@ -0,0 +1,751 @@
1 |
+
{
|
2 |
+
"metadata": {
|
3 |
+
"total_size": 11761496064
|
4 |
+
},
|
5 |
+
"weight_map": {
|
6 |
+
"decoder.block.0.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
7 |
+
"decoder.block.0.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
8 |
+
"decoder.block.0.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
9 |
+
"decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "pytorch_model-00001-of-00002.bin",
|
10 |
+
"decoder.block.0.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
11 |
+
"decoder.block.0.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
12 |
+
"decoder.block.0.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
13 |
+
"decoder.block.0.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
14 |
+
"decoder.block.0.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
15 |
+
"decoder.block.0.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
16 |
+
"decoder.block.0.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
17 |
+
"decoder.block.0.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
18 |
+
"decoder.block.0.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
19 |
+
"decoder.block.0.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
20 |
+
"decoder.block.0.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
21 |
+
"decoder.block.1.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
22 |
+
"decoder.block.1.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
23 |
+
"decoder.block.1.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
24 |
+
"decoder.block.1.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
25 |
+
"decoder.block.1.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
26 |
+
"decoder.block.1.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
27 |
+
"decoder.block.1.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
28 |
+
"decoder.block.1.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
29 |
+
"decoder.block.1.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
30 |
+
"decoder.block.1.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
31 |
+
"decoder.block.1.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
32 |
+
"decoder.block.1.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
33 |
+
"decoder.block.1.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
34 |
+
"decoder.block.1.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
35 |
+
"decoder.block.10.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
36 |
+
"decoder.block.10.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
37 |
+
"decoder.block.10.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
38 |
+
"decoder.block.10.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
39 |
+
"decoder.block.10.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
40 |
+
"decoder.block.10.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
41 |
+
"decoder.block.10.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
42 |
+
"decoder.block.10.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
43 |
+
"decoder.block.10.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
44 |
+
"decoder.block.10.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
45 |
+
"decoder.block.10.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
46 |
+
"decoder.block.10.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
47 |
+
"decoder.block.10.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
48 |
+
"decoder.block.10.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
49 |
+
"decoder.block.11.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
50 |
+
"decoder.block.11.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
51 |
+
"decoder.block.11.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
52 |
+
"decoder.block.11.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
53 |
+
"decoder.block.11.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
54 |
+
"decoder.block.11.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
55 |
+
"decoder.block.11.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
56 |
+
"decoder.block.11.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
57 |
+
"decoder.block.11.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
58 |
+
"decoder.block.11.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
59 |
+
"decoder.block.11.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
60 |
+
"decoder.block.11.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
61 |
+
"decoder.block.11.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
62 |
+
"decoder.block.11.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
63 |
+
"decoder.block.12.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
64 |
+
"decoder.block.12.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
65 |
+
"decoder.block.12.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
66 |
+
"decoder.block.12.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
67 |
+
"decoder.block.12.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
68 |
+
"decoder.block.12.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
69 |
+
"decoder.block.12.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
70 |
+
"decoder.block.12.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
71 |
+
"decoder.block.12.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
72 |
+
"decoder.block.12.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
73 |
+
"decoder.block.12.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
74 |
+
"decoder.block.12.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
75 |
+
"decoder.block.12.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
76 |
+
"decoder.block.12.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
77 |
+
"decoder.block.13.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
78 |
+
"decoder.block.13.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
79 |
+
"decoder.block.13.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
80 |
+
"decoder.block.13.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
81 |
+
"decoder.block.13.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
82 |
+
"decoder.block.13.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
83 |
+
"decoder.block.13.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
84 |
+
"decoder.block.13.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
85 |
+
"decoder.block.13.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
86 |
+
"decoder.block.13.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
87 |
+
"decoder.block.13.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
88 |
+
"decoder.block.13.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
89 |
+
"decoder.block.13.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
90 |
+
"decoder.block.13.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
91 |
+
"decoder.block.14.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
92 |
+
"decoder.block.14.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
93 |
+
"decoder.block.14.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
94 |
+
"decoder.block.14.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
95 |
+
"decoder.block.14.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
96 |
+
"decoder.block.14.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
97 |
+
"decoder.block.14.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
98 |
+
"decoder.block.14.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
99 |
+
"decoder.block.14.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
100 |
+
"decoder.block.14.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
101 |
+
"decoder.block.14.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
102 |
+
"decoder.block.14.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
103 |
+
"decoder.block.14.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
104 |
+
"decoder.block.14.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
105 |
+
"decoder.block.15.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
106 |
+
"decoder.block.15.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
107 |
+
"decoder.block.15.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
108 |
+
"decoder.block.15.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
109 |
+
"decoder.block.15.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
110 |
+
"decoder.block.15.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
111 |
+
"decoder.block.15.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
112 |
+
"decoder.block.15.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
113 |
+
"decoder.block.15.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
114 |
+
"decoder.block.15.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
115 |
+
"decoder.block.15.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
116 |
+
"decoder.block.15.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
117 |
+
"decoder.block.15.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
118 |
+
"decoder.block.15.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
119 |
+
"decoder.block.16.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
120 |
+
"decoder.block.16.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
121 |
+
"decoder.block.16.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
122 |
+
"decoder.block.16.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
123 |
+
"decoder.block.16.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
124 |
+
"decoder.block.16.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
125 |
+
"decoder.block.16.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
126 |
+
"decoder.block.16.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
127 |
+
"decoder.block.16.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
128 |
+
"decoder.block.16.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
129 |
+
"decoder.block.16.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
130 |
+
"decoder.block.16.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
131 |
+
"decoder.block.16.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
132 |
+
"decoder.block.16.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
133 |
+
"decoder.block.17.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
134 |
+
"decoder.block.17.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
135 |
+
"decoder.block.17.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
136 |
+
"decoder.block.17.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
137 |
+
"decoder.block.17.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
138 |
+
"decoder.block.17.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
139 |
+
"decoder.block.17.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
140 |
+
"decoder.block.17.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
141 |
+
"decoder.block.17.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
142 |
+
"decoder.block.17.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
143 |
+
"decoder.block.17.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
144 |
+
"decoder.block.17.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
145 |
+
"decoder.block.17.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
146 |
+
"decoder.block.17.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
147 |
+
"decoder.block.18.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
148 |
+
"decoder.block.18.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
149 |
+
"decoder.block.18.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
150 |
+
"decoder.block.18.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
151 |
+
"decoder.block.18.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
152 |
+
"decoder.block.18.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
153 |
+
"decoder.block.18.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
154 |
+
"decoder.block.18.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
155 |
+
"decoder.block.18.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
156 |
+
"decoder.block.18.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
157 |
+
"decoder.block.18.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
158 |
+
"decoder.block.18.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
159 |
+
"decoder.block.18.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
160 |
+
"decoder.block.18.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
161 |
+
"decoder.block.19.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
162 |
+
"decoder.block.19.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
163 |
+
"decoder.block.19.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
164 |
+
"decoder.block.19.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
165 |
+
"decoder.block.19.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
166 |
+
"decoder.block.19.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
167 |
+
"decoder.block.19.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
168 |
+
"decoder.block.19.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
169 |
+
"decoder.block.19.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
170 |
+
"decoder.block.19.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
171 |
+
"decoder.block.19.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
172 |
+
"decoder.block.19.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
173 |
+
"decoder.block.19.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
174 |
+
"decoder.block.19.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
175 |
+
"decoder.block.2.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
176 |
+
"decoder.block.2.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
177 |
+
"decoder.block.2.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
178 |
+
"decoder.block.2.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
179 |
+
"decoder.block.2.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
180 |
+
"decoder.block.2.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
181 |
+
"decoder.block.2.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
182 |
+
"decoder.block.2.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
183 |
+
"decoder.block.2.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
184 |
+
"decoder.block.2.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
185 |
+
"decoder.block.2.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
186 |
+
"decoder.block.2.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
187 |
+
"decoder.block.2.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
188 |
+
"decoder.block.2.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
189 |
+
"decoder.block.20.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
190 |
+
"decoder.block.20.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
191 |
+
"decoder.block.20.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
192 |
+
"decoder.block.20.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
193 |
+
"decoder.block.20.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
194 |
+
"decoder.block.20.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
195 |
+
"decoder.block.20.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
196 |
+
"decoder.block.20.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
197 |
+
"decoder.block.20.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
198 |
+
"decoder.block.20.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
199 |
+
"decoder.block.20.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
200 |
+
"decoder.block.20.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
201 |
+
"decoder.block.20.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
202 |
+
"decoder.block.20.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
203 |
+
"decoder.block.21.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
204 |
+
"decoder.block.21.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
205 |
+
"decoder.block.21.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
206 |
+
"decoder.block.21.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
207 |
+
"decoder.block.21.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
208 |
+
"decoder.block.21.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
209 |
+
"decoder.block.21.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
210 |
+
"decoder.block.21.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
211 |
+
"decoder.block.21.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
212 |
+
"decoder.block.21.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
213 |
+
"decoder.block.21.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
214 |
+
"decoder.block.21.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
215 |
+
"decoder.block.21.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
216 |
+
"decoder.block.21.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
217 |
+
"decoder.block.22.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
218 |
+
"decoder.block.22.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
219 |
+
"decoder.block.22.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
220 |
+
"decoder.block.22.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
221 |
+
"decoder.block.22.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
222 |
+
"decoder.block.22.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
223 |
+
"decoder.block.22.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
224 |
+
"decoder.block.22.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
225 |
+
"decoder.block.22.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
226 |
+
"decoder.block.22.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
227 |
+
"decoder.block.22.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
228 |
+
"decoder.block.22.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
229 |
+
"decoder.block.22.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
230 |
+
"decoder.block.22.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
231 |
+
"decoder.block.23.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
232 |
+
"decoder.block.23.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
233 |
+
"decoder.block.23.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
234 |
+
"decoder.block.23.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
235 |
+
"decoder.block.23.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
236 |
+
"decoder.block.23.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
237 |
+
"decoder.block.23.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
238 |
+
"decoder.block.23.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
239 |
+
"decoder.block.23.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
240 |
+
"decoder.block.23.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
241 |
+
"decoder.block.23.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
242 |
+
"decoder.block.23.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
243 |
+
"decoder.block.23.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
244 |
+
"decoder.block.23.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
245 |
+
"decoder.block.24.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
246 |
+
"decoder.block.24.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
247 |
+
"decoder.block.24.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
248 |
+
"decoder.block.24.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
249 |
+
"decoder.block.24.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
250 |
+
"decoder.block.24.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
251 |
+
"decoder.block.24.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
252 |
+
"decoder.block.24.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
253 |
+
"decoder.block.24.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
254 |
+
"decoder.block.24.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
255 |
+
"decoder.block.24.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
256 |
+
"decoder.block.24.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
257 |
+
"decoder.block.24.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
258 |
+
"decoder.block.24.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
259 |
+
"decoder.block.25.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
260 |
+
"decoder.block.25.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
261 |
+
"decoder.block.25.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
262 |
+
"decoder.block.25.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
263 |
+
"decoder.block.25.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
264 |
+
"decoder.block.25.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
265 |
+
"decoder.block.25.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
266 |
+
"decoder.block.25.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
267 |
+
"decoder.block.25.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
268 |
+
"decoder.block.25.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
269 |
+
"decoder.block.25.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
270 |
+
"decoder.block.25.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
271 |
+
"decoder.block.25.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
272 |
+
"decoder.block.25.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
273 |
+
"decoder.block.26.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
274 |
+
"decoder.block.26.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
275 |
+
"decoder.block.26.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
276 |
+
"decoder.block.26.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
277 |
+
"decoder.block.26.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
278 |
+
"decoder.block.26.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
279 |
+
"decoder.block.26.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
280 |
+
"decoder.block.26.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
281 |
+
"decoder.block.26.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
282 |
+
"decoder.block.26.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
283 |
+
"decoder.block.26.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
284 |
+
"decoder.block.26.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
285 |
+
"decoder.block.26.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
286 |
+
"decoder.block.26.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
287 |
+
"decoder.block.27.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
288 |
+
"decoder.block.27.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
289 |
+
"decoder.block.27.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
290 |
+
"decoder.block.27.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
291 |
+
"decoder.block.27.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
292 |
+
"decoder.block.27.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
293 |
+
"decoder.block.27.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
294 |
+
"decoder.block.27.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
295 |
+
"decoder.block.27.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
296 |
+
"decoder.block.27.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
297 |
+
"decoder.block.27.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
298 |
+
"decoder.block.27.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
|
299 |
+
"decoder.block.27.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
|
300 |
+
"decoder.block.27.layer.2.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
|
301 |
+
"decoder.block.28.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
|
302 |
+
"decoder.block.28.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
|
303 |
+
"decoder.block.28.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
|
304 |
+
"decoder.block.28.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
|
305 |
+
"decoder.block.28.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
|
306 |
+
"decoder.block.28.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00002.bin",
|
307 |
+
"decoder.block.28.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00002.bin",
|
308 |
+
"decoder.block.28.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00002.bin",
|
309 |
+
"decoder.block.28.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00002.bin",
|
310 |
+
"decoder.block.28.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
|
311 |
+
"decoder.block.28.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
|
312 |
+
"decoder.block.28.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
|
313 |
+
"decoder.block.28.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
|
314 |
+
"decoder.block.28.layer.2.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
|
315 |
+
"decoder.block.29.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
|
316 |
+
"decoder.block.29.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
|
317 |
+
"decoder.block.29.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
|
318 |
+
"decoder.block.29.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
|
319 |
+
"decoder.block.29.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
|
320 |
+
"decoder.block.29.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00002.bin",
|
321 |
+
"decoder.block.29.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00002.bin",
|
322 |
+
"decoder.block.29.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00002.bin",
|
323 |
+
"decoder.block.29.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00002.bin",
|
324 |
+
"decoder.block.29.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
|
325 |
+
"decoder.block.29.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
|
326 |
+
"decoder.block.29.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
|
327 |
+
"decoder.block.29.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
|
328 |
+
"decoder.block.29.layer.2.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
|
329 |
+
"decoder.block.3.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
330 |
+
"decoder.block.3.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
331 |
+
"decoder.block.3.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
332 |
+
"decoder.block.3.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
333 |
+
"decoder.block.3.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
334 |
+
"decoder.block.3.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
335 |
+
"decoder.block.3.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
336 |
+
"decoder.block.3.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
337 |
+
"decoder.block.3.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
338 |
+
"decoder.block.3.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
339 |
+
"decoder.block.3.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
340 |
+
"decoder.block.3.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
341 |
+
"decoder.block.3.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
342 |
+
"decoder.block.3.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
343 |
+
"decoder.block.30.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
|
344 |
+
"decoder.block.30.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
|
345 |
+
"decoder.block.30.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
|
346 |
+
"decoder.block.30.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
|
347 |
+
"decoder.block.30.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
|
348 |
+
"decoder.block.30.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00002.bin",
|
349 |
+
"decoder.block.30.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00002.bin",
|
350 |
+
"decoder.block.30.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00002.bin",
|
351 |
+
"decoder.block.30.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00002.bin",
|
352 |
+
"decoder.block.30.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
|
353 |
+
"decoder.block.30.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
|
354 |
+
"decoder.block.30.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
|
355 |
+
"decoder.block.30.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
|
356 |
+
"decoder.block.30.layer.2.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
|
357 |
+
"decoder.block.31.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
|
358 |
+
"decoder.block.31.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
|
359 |
+
"decoder.block.31.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
|
360 |
+
"decoder.block.31.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
|
361 |
+
"decoder.block.31.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
|
362 |
+
"decoder.block.31.layer.1.EncDecAttention.k.weight": "pytorch_model-00002-of-00002.bin",
|
363 |
+
"decoder.block.31.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00002.bin",
|
364 |
+
"decoder.block.31.layer.1.EncDecAttention.q.weight": "pytorch_model-00002-of-00002.bin",
|
365 |
+
"decoder.block.31.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00002.bin",
|
366 |
+
"decoder.block.31.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
|
367 |
+
"decoder.block.31.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
|
368 |
+
"decoder.block.31.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
|
369 |
+
"decoder.block.31.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
|
370 |
+
"decoder.block.31.layer.2.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
|
371 |
+
"decoder.block.4.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
372 |
+
"decoder.block.4.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
373 |
+
"decoder.block.4.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
374 |
+
"decoder.block.4.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
375 |
+
"decoder.block.4.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
376 |
+
"decoder.block.4.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
377 |
+
"decoder.block.4.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
378 |
+
"decoder.block.4.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
379 |
+
"decoder.block.4.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
380 |
+
"decoder.block.4.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
381 |
+
"decoder.block.4.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
382 |
+
"decoder.block.4.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
383 |
+
"decoder.block.4.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
384 |
+
"decoder.block.4.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
385 |
+
"decoder.block.5.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
386 |
+
"decoder.block.5.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
387 |
+
"decoder.block.5.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
388 |
+
"decoder.block.5.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
389 |
+
"decoder.block.5.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
390 |
+
"decoder.block.5.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
391 |
+
"decoder.block.5.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
392 |
+
"decoder.block.5.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
393 |
+
"decoder.block.5.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
394 |
+
"decoder.block.5.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
395 |
+
"decoder.block.5.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
396 |
+
"decoder.block.5.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
397 |
+
"decoder.block.5.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
398 |
+
"decoder.block.5.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
399 |
+
"decoder.block.6.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
400 |
+
"decoder.block.6.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
401 |
+
"decoder.block.6.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
402 |
+
"decoder.block.6.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
403 |
+
"decoder.block.6.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
404 |
+
"decoder.block.6.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
405 |
+
"decoder.block.6.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
406 |
+
"decoder.block.6.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
407 |
+
"decoder.block.6.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
408 |
+
"decoder.block.6.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
409 |
+
"decoder.block.6.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
410 |
+
"decoder.block.6.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
411 |
+
"decoder.block.6.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
412 |
+
"decoder.block.6.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
413 |
+
"decoder.block.7.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
414 |
+
"decoder.block.7.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
415 |
+
"decoder.block.7.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
416 |
+
"decoder.block.7.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
417 |
+
"decoder.block.7.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
418 |
+
"decoder.block.7.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
419 |
+
"decoder.block.7.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
420 |
+
"decoder.block.7.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
421 |
+
"decoder.block.7.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
422 |
+
"decoder.block.7.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
423 |
+
"decoder.block.7.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
424 |
+
"decoder.block.7.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
425 |
+
"decoder.block.7.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
426 |
+
"decoder.block.7.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
427 |
+
"decoder.block.8.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
428 |
+
"decoder.block.8.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
429 |
+
"decoder.block.8.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
430 |
+
"decoder.block.8.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
431 |
+
"decoder.block.8.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
432 |
+
"decoder.block.8.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
433 |
+
"decoder.block.8.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
434 |
+
"decoder.block.8.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
435 |
+
"decoder.block.8.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
436 |
+
"decoder.block.8.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
437 |
+
"decoder.block.8.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
438 |
+
"decoder.block.8.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
439 |
+
"decoder.block.8.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
440 |
+
"decoder.block.8.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
441 |
+
"decoder.block.9.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
442 |
+
"decoder.block.9.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
443 |
+
"decoder.block.9.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
444 |
+
"decoder.block.9.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
445 |
+
"decoder.block.9.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
446 |
+
"decoder.block.9.layer.1.EncDecAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
447 |
+
"decoder.block.9.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
448 |
+
"decoder.block.9.layer.1.EncDecAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
449 |
+
"decoder.block.9.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
450 |
+
"decoder.block.9.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
451 |
+
"decoder.block.9.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
452 |
+
"decoder.block.9.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
453 |
+
"decoder.block.9.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
454 |
+
"decoder.block.9.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
455 |
+
"decoder.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
|
456 |
+
"decoder.final_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
|
457 |
+
"encoder.block.0.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
458 |
+
"encoder.block.0.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
459 |
+
"encoder.block.0.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
460 |
+
"encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "pytorch_model-00001-of-00002.bin",
|
461 |
+
"encoder.block.0.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
462 |
+
"encoder.block.0.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
463 |
+
"encoder.block.0.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
464 |
+
"encoder.block.0.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
465 |
+
"encoder.block.0.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
466 |
+
"encoder.block.0.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
467 |
+
"encoder.block.1.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
468 |
+
"encoder.block.1.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
469 |
+
"encoder.block.1.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
470 |
+
"encoder.block.1.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
471 |
+
"encoder.block.1.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
472 |
+
"encoder.block.1.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
473 |
+
"encoder.block.1.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
474 |
+
"encoder.block.1.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
475 |
+
"encoder.block.1.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
476 |
+
"encoder.block.10.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
477 |
+
"encoder.block.10.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
478 |
+
"encoder.block.10.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
479 |
+
"encoder.block.10.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
480 |
+
"encoder.block.10.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
481 |
+
"encoder.block.10.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
482 |
+
"encoder.block.10.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
483 |
+
"encoder.block.10.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
484 |
+
"encoder.block.10.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
485 |
+
"encoder.block.11.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
486 |
+
"encoder.block.11.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
487 |
+
"encoder.block.11.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
488 |
+
"encoder.block.11.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
489 |
+
"encoder.block.11.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
490 |
+
"encoder.block.11.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
491 |
+
"encoder.block.11.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
492 |
+
"encoder.block.11.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
493 |
+
"encoder.block.11.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
494 |
+
"encoder.block.12.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
495 |
+
"encoder.block.12.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
496 |
+
"encoder.block.12.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
497 |
+
"encoder.block.12.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
498 |
+
"encoder.block.12.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
499 |
+
"encoder.block.12.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
500 |
+
"encoder.block.12.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
501 |
+
"encoder.block.12.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
502 |
+
"encoder.block.12.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
503 |
+
"encoder.block.13.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
504 |
+
"encoder.block.13.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
505 |
+
"encoder.block.13.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
506 |
+
"encoder.block.13.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
507 |
+
"encoder.block.13.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
508 |
+
"encoder.block.13.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
509 |
+
"encoder.block.13.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
510 |
+
"encoder.block.13.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
511 |
+
"encoder.block.13.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
512 |
+
"encoder.block.14.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
513 |
+
"encoder.block.14.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
514 |
+
"encoder.block.14.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
515 |
+
"encoder.block.14.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
516 |
+
"encoder.block.14.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
517 |
+
"encoder.block.14.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
518 |
+
"encoder.block.14.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
519 |
+
"encoder.block.14.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
520 |
+
"encoder.block.14.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
521 |
+
"encoder.block.15.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
522 |
+
"encoder.block.15.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
523 |
+
"encoder.block.15.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
524 |
+
"encoder.block.15.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
525 |
+
"encoder.block.15.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
526 |
+
"encoder.block.15.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
527 |
+
"encoder.block.15.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
528 |
+
"encoder.block.15.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
529 |
+
"encoder.block.15.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
530 |
+
"encoder.block.16.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
531 |
+
"encoder.block.16.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
532 |
+
"encoder.block.16.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
533 |
+
"encoder.block.16.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
534 |
+
"encoder.block.16.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
535 |
+
"encoder.block.16.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
536 |
+
"encoder.block.16.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
537 |
+
"encoder.block.16.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
538 |
+
"encoder.block.16.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
539 |
+
"encoder.block.17.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
540 |
+
"encoder.block.17.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
541 |
+
"encoder.block.17.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
542 |
+
"encoder.block.17.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
543 |
+
"encoder.block.17.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
544 |
+
"encoder.block.17.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
545 |
+
"encoder.block.17.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
546 |
+
"encoder.block.17.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
547 |
+
"encoder.block.17.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
548 |
+
"encoder.block.18.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
549 |
+
"encoder.block.18.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
550 |
+
"encoder.block.18.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
551 |
+
"encoder.block.18.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
552 |
+
"encoder.block.18.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
553 |
+
"encoder.block.18.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
554 |
+
"encoder.block.18.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
555 |
+
"encoder.block.18.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
556 |
+
"encoder.block.18.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
557 |
+
"encoder.block.19.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
558 |
+
"encoder.block.19.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
559 |
+
"encoder.block.19.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
560 |
+
"encoder.block.19.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
561 |
+
"encoder.block.19.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
562 |
+
"encoder.block.19.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
563 |
+
"encoder.block.19.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
564 |
+
"encoder.block.19.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
565 |
+
"encoder.block.19.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
566 |
+
"encoder.block.2.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
567 |
+
"encoder.block.2.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
568 |
+
"encoder.block.2.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
569 |
+
"encoder.block.2.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
570 |
+
"encoder.block.2.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
571 |
+
"encoder.block.2.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
572 |
+
"encoder.block.2.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
573 |
+
"encoder.block.2.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
574 |
+
"encoder.block.2.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
575 |
+
"encoder.block.20.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
576 |
+
"encoder.block.20.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
577 |
+
"encoder.block.20.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
578 |
+
"encoder.block.20.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
579 |
+
"encoder.block.20.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
580 |
+
"encoder.block.20.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
581 |
+
"encoder.block.20.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
582 |
+
"encoder.block.20.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
583 |
+
"encoder.block.20.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
584 |
+
"encoder.block.21.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
585 |
+
"encoder.block.21.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
586 |
+
"encoder.block.21.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
587 |
+
"encoder.block.21.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
588 |
+
"encoder.block.21.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
589 |
+
"encoder.block.21.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
590 |
+
"encoder.block.21.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
591 |
+
"encoder.block.21.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
592 |
+
"encoder.block.21.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
593 |
+
"encoder.block.22.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
594 |
+
"encoder.block.22.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
595 |
+
"encoder.block.22.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
596 |
+
"encoder.block.22.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
597 |
+
"encoder.block.22.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
598 |
+
"encoder.block.22.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
599 |
+
"encoder.block.22.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
600 |
+
"encoder.block.22.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
601 |
+
"encoder.block.22.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
602 |
+
"encoder.block.23.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
603 |
+
"encoder.block.23.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
604 |
+
"encoder.block.23.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
605 |
+
"encoder.block.23.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
606 |
+
"encoder.block.23.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
607 |
+
"encoder.block.23.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
608 |
+
"encoder.block.23.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
609 |
+
"encoder.block.23.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
610 |
+
"encoder.block.23.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
611 |
+
"encoder.block.24.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
612 |
+
"encoder.block.24.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
613 |
+
"encoder.block.24.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
614 |
+
"encoder.block.24.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
615 |
+
"encoder.block.24.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
616 |
+
"encoder.block.24.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
617 |
+
"encoder.block.24.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
618 |
+
"encoder.block.24.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
619 |
+
"encoder.block.24.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
620 |
+
"encoder.block.25.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
621 |
+
"encoder.block.25.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
622 |
+
"encoder.block.25.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
623 |
+
"encoder.block.25.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
624 |
+
"encoder.block.25.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
625 |
+
"encoder.block.25.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
626 |
+
"encoder.block.25.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
627 |
+
"encoder.block.25.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
628 |
+
"encoder.block.25.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
629 |
+
"encoder.block.26.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
630 |
+
"encoder.block.26.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
631 |
+
"encoder.block.26.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
632 |
+
"encoder.block.26.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
633 |
+
"encoder.block.26.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
634 |
+
"encoder.block.26.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
635 |
+
"encoder.block.26.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
636 |
+
"encoder.block.26.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
637 |
+
"encoder.block.26.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
638 |
+
"encoder.block.27.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
639 |
+
"encoder.block.27.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
640 |
+
"encoder.block.27.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
641 |
+
"encoder.block.27.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
642 |
+
"encoder.block.27.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
643 |
+
"encoder.block.27.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
644 |
+
"encoder.block.27.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
645 |
+
"encoder.block.27.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
646 |
+
"encoder.block.27.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
647 |
+
"encoder.block.28.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
648 |
+
"encoder.block.28.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
649 |
+
"encoder.block.28.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
650 |
+
"encoder.block.28.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
651 |
+
"encoder.block.28.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
652 |
+
"encoder.block.28.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
653 |
+
"encoder.block.28.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
654 |
+
"encoder.block.28.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
655 |
+
"encoder.block.28.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
656 |
+
"encoder.block.29.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
657 |
+
"encoder.block.29.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
658 |
+
"encoder.block.29.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
659 |
+
"encoder.block.29.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
660 |
+
"encoder.block.29.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
661 |
+
"encoder.block.29.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
662 |
+
"encoder.block.29.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
663 |
+
"encoder.block.29.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
664 |
+
"encoder.block.29.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
665 |
+
"encoder.block.3.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
666 |
+
"encoder.block.3.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
667 |
+
"encoder.block.3.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
668 |
+
"encoder.block.3.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
669 |
+
"encoder.block.3.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
670 |
+
"encoder.block.3.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
671 |
+
"encoder.block.3.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
672 |
+
"encoder.block.3.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
673 |
+
"encoder.block.3.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
674 |
+
"encoder.block.30.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
675 |
+
"encoder.block.30.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
676 |
+
"encoder.block.30.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
677 |
+
"encoder.block.30.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
678 |
+
"encoder.block.30.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
679 |
+
"encoder.block.30.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
680 |
+
"encoder.block.30.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
681 |
+
"encoder.block.30.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
682 |
+
"encoder.block.30.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
683 |
+
"encoder.block.31.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
684 |
+
"encoder.block.31.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
685 |
+
"encoder.block.31.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
686 |
+
"encoder.block.31.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
687 |
+
"encoder.block.31.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
688 |
+
"encoder.block.31.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
689 |
+
"encoder.block.31.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
690 |
+
"encoder.block.31.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
691 |
+
"encoder.block.31.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
692 |
+
"encoder.block.4.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
693 |
+
"encoder.block.4.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
694 |
+
"encoder.block.4.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
695 |
+
"encoder.block.4.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
696 |
+
"encoder.block.4.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
697 |
+
"encoder.block.4.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
698 |
+
"encoder.block.4.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
699 |
+
"encoder.block.4.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
700 |
+
"encoder.block.4.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
701 |
+
"encoder.block.5.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
702 |
+
"encoder.block.5.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
703 |
+
"encoder.block.5.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
704 |
+
"encoder.block.5.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
705 |
+
"encoder.block.5.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
706 |
+
"encoder.block.5.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
707 |
+
"encoder.block.5.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
708 |
+
"encoder.block.5.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
709 |
+
"encoder.block.5.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
710 |
+
"encoder.block.6.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
711 |
+
"encoder.block.6.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
712 |
+
"encoder.block.6.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
713 |
+
"encoder.block.6.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
714 |
+
"encoder.block.6.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
715 |
+
"encoder.block.6.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
716 |
+
"encoder.block.6.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
717 |
+
"encoder.block.6.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
718 |
+
"encoder.block.6.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
719 |
+
"encoder.block.7.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
720 |
+
"encoder.block.7.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
721 |
+
"encoder.block.7.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
722 |
+
"encoder.block.7.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
723 |
+
"encoder.block.7.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
724 |
+
"encoder.block.7.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
725 |
+
"encoder.block.7.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
726 |
+
"encoder.block.7.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
727 |
+
"encoder.block.7.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
728 |
+
"encoder.block.8.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
729 |
+
"encoder.block.8.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
730 |
+
"encoder.block.8.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
731 |
+
"encoder.block.8.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
732 |
+
"encoder.block.8.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
733 |
+
"encoder.block.8.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
734 |
+
"encoder.block.8.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
735 |
+
"encoder.block.8.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
736 |
+
"encoder.block.8.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
737 |
+
"encoder.block.9.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
|
738 |
+
"encoder.block.9.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
|
739 |
+
"encoder.block.9.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
|
740 |
+
"encoder.block.9.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
|
741 |
+
"encoder.block.9.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
742 |
+
"encoder.block.9.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
|
743 |
+
"encoder.block.9.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
|
744 |
+
"encoder.block.9.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
|
745 |
+
"encoder.block.9.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
746 |
+
"encoder.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
|
747 |
+
"encoder.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
|
748 |
+
"lm_head.weight": "pytorch_model-00002-of-00002.bin",
|
749 |
+
"shared.weight": "pytorch_model-00001-of-00002.bin"
|
750 |
+
}
|
751 |
+
}
|
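For reference, `pytorch_model.bin.index.json` is the standard `transformers` sharded-checkpoint index: `weight_map` assigns each parameter name to the shard file that stores it, and `from_pretrained` reads this file to load both `.bin` shards. The lookup can also be done by hand; a minimal sketch (the local directory layout is an assumption for illustration):

```python
import json
import torch

# Use weight_map to find which shard stores a parameter, then load only
# that shard. File names match the index above; paths assume the shards
# sit next to the index file.
with open("pytorch_model.bin.index.json") as f:
    index = json.load(f)

name = "decoder.block.28.layer.0.SelfAttention.q.weight"
shard = index["weight_map"][name]  # "pytorch_model-00002-of-00002.bin"
state_dict = torch.load(shard, map_location="cpu")
print(name, tuple(state_dict[name].shape))
```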
special_tokens_map.json
CHANGED
@@ -1,23 +1,107 @@
 {
-  (lines 2-22 of the previous file are not recoverable from this rendering)
+  "additional_special_tokens": [
+    "<extra_id_0>",
+    "<extra_id_1>",
+    "<extra_id_2>",
+    "<extra_id_3>",
+    "<extra_id_4>",
+    "<extra_id_5>",
+    "<extra_id_6>",
+    "<extra_id_7>",
+    "<extra_id_8>",
+    "<extra_id_9>",
+    "<extra_id_10>",
+    "<extra_id_11>",
+    "<extra_id_12>",
+    "<extra_id_13>",
+    "<extra_id_14>",
+    "<extra_id_15>",
+    "<extra_id_16>",
+    "<extra_id_17>",
+    "<extra_id_18>",
+    "<extra_id_19>",
+    "<extra_id_20>",
+    "<extra_id_21>",
+    "<extra_id_22>",
+    "<extra_id_23>",
+    "<extra_id_24>",
+    "<extra_id_25>",
+    "<extra_id_26>",
+    "<extra_id_27>",
+    "<extra_id_28>",
+    "<extra_id_29>",
+    "<extra_id_30>",
+    "<extra_id_31>",
+    "<extra_id_32>",
+    "<extra_id_33>",
+    "<extra_id_34>",
+    "<extra_id_35>",
+    "<extra_id_36>",
+    "<extra_id_37>",
+    "<extra_id_38>",
+    "<extra_id_39>",
+    "<extra_id_40>",
+    "<extra_id_41>",
+    "<extra_id_42>",
+    "<extra_id_43>",
+    "<extra_id_44>",
+    "<extra_id_45>",
+    "<extra_id_46>",
+    "<extra_id_47>",
+    "<extra_id_48>",
+    "<extra_id_49>",
+    "<extra_id_50>",
+    "<extra_id_51>",
+    "<extra_id_52>",
+    "<extra_id_53>",
+    "<extra_id_54>",
+    "<extra_id_55>",
+    "<extra_id_56>",
+    "<extra_id_57>",
+    "<extra_id_58>",
+    "<extra_id_59>",
+    "<extra_id_60>",
+    "<extra_id_61>",
+    "<extra_id_62>",
+    "<extra_id_63>",
+    "<extra_id_64>",
+    "<extra_id_65>",
+    "<extra_id_66>",
+    "<extra_id_67>",
+    "<extra_id_68>",
+    "<extra_id_69>",
+    "<extra_id_70>",
+    "<extra_id_71>",
+    "<extra_id_72>",
+    "<extra_id_73>",
+    "<extra_id_74>",
+    "<extra_id_75>",
+    "<extra_id_76>",
+    "<extra_id_77>",
+    "<extra_id_78>",
+    "<extra_id_79>",
+    "<extra_id_80>",
+    "<extra_id_81>",
+    "<extra_id_82>",
+    "<extra_id_83>",
+    "<extra_id_84>",
+    "<extra_id_85>",
+    "<extra_id_86>",
+    "<extra_id_87>",
+    "<extra_id_88>",
+    "<extra_id_89>",
+    "<extra_id_90>",
+    "<extra_id_91>",
+    "<extra_id_92>",
+    "<extra_id_93>",
+    "<extra_id_94>",
+    "<extra_id_95>",
+    "<extra_id_96>",
+    "<extra_id_97>",
+    "<extra_id_98>",
+    "<extra_id_99>"
+  ],
+  "eos_token": "</s>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
 }
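The new `special_tokens_map.json` uses plain string entries and registers the 100 sentinel tokens alongside `</s>`, `<pad>`, and `<unk>`. A quick hedged check that the map is picked up at load time (the repo id is a placeholder):

```python
from transformers import AutoTokenizer

# "path/to/this-repo" is a placeholder for the actual model repo or a
# local clone of it.
tok = AutoTokenizer.from_pretrained("path/to/this-repo")
print(tok.eos_token, tok.pad_token, tok.unk_token)  # expect </s> <pad> <unk>
print(len(tok.additional_special_tokens))           # expect 100
```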
spiece.model
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:df65ad942bf9646454eaaff5bdb08e155b6b4e105ae8e2dac96d0ebb2e455073
+size 2148928
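`spiece.model` is stored as a git-lfs pointer, so the diff above only records the SHA-256 and byte size of the new SentencePiece model (the old pointer's values are not rendered). A small sketch to verify a pulled copy against the pointer:

```python
import hashlib

# Hash a locally pulled spiece.model and compare it with the oid
# recorded in the git-lfs pointer above.
EXPECTED = "df65ad942bf9646454eaaff5bdb08e155b6b4e105ae8e2dac96d0ebb2e455073"
with open("spiece.model", "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()
print(digest == EXPECTED, digest)
```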
tokenizer.json
CHANGED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
CHANGED
@@ -1,37 +1,112 @@
 {
-  (lines 2-28 of the previous file are not recoverable from this rendering)
+  "additional_special_tokens": [
+    "<extra_id_0>",
+    "<extra_id_1>",
+    "<extra_id_2>",
+    "<extra_id_3>",
+    "<extra_id_4>",
+    "<extra_id_5>",
+    "<extra_id_6>",
+    "<extra_id_7>",
+    "<extra_id_8>",
+    "<extra_id_9>",
+    "<extra_id_10>",
+    "<extra_id_11>",
+    "<extra_id_12>",
+    "<extra_id_13>",
+    "<extra_id_14>",
+    "<extra_id_15>",
+    "<extra_id_16>",
+    "<extra_id_17>",
+    "<extra_id_18>",
+    "<extra_id_19>",
+    "<extra_id_20>",
+    "<extra_id_21>",
+    "<extra_id_22>",
+    "<extra_id_23>",
+    "<extra_id_24>",
+    "<extra_id_25>",
+    "<extra_id_26>",
+    "<extra_id_27>",
+    "<extra_id_28>",
+    "<extra_id_29>",
+    "<extra_id_30>",
+    "<extra_id_31>",
+    "<extra_id_32>",
+    "<extra_id_33>",
+    "<extra_id_34>",
+    "<extra_id_35>",
+    "<extra_id_36>",
+    "<extra_id_37>",
+    "<extra_id_38>",
+    "<extra_id_39>",
+    "<extra_id_40>",
+    "<extra_id_41>",
+    "<extra_id_42>",
+    "<extra_id_43>",
+    "<extra_id_44>",
+    "<extra_id_45>",
+    "<extra_id_46>",
+    "<extra_id_47>",
+    "<extra_id_48>",
+    "<extra_id_49>",
+    "<extra_id_50>",
+    "<extra_id_51>",
+    "<extra_id_52>",
+    "<extra_id_53>",
+    "<extra_id_54>",
+    "<extra_id_55>",
+    "<extra_id_56>",
+    "<extra_id_57>",
+    "<extra_id_58>",
+    "<extra_id_59>",
+    "<extra_id_60>",
+    "<extra_id_61>",
+    "<extra_id_62>",
+    "<extra_id_63>",
+    "<extra_id_64>",
+    "<extra_id_65>",
+    "<extra_id_66>",
+    "<extra_id_67>",
+    "<extra_id_68>",
+    "<extra_id_69>",
+    "<extra_id_70>",
+    "<extra_id_71>",
+    "<extra_id_72>",
+    "<extra_id_73>",
+    "<extra_id_74>",
+    "<extra_id_75>",
+    "<extra_id_76>",
+    "<extra_id_77>",
+    "<extra_id_78>",
+    "<extra_id_79>",
+    "<extra_id_80>",
+    "<extra_id_81>",
+    "<extra_id_82>",
+    "<extra_id_83>",
+    "<extra_id_84>",
+    "<extra_id_85>",
+    "<extra_id_86>",
+    "<extra_id_87>",
+    "<extra_id_88>",
+    "<extra_id_89>",
+    "<extra_id_90>",
+    "<extra_id_91>",
+    "<extra_id_92>",
+    "<extra_id_93>",
+    "<extra_id_94>",
+    "<extra_id_95>",
+    "<extra_id_96>",
+    "<extra_id_97>",
+    "<extra_id_98>",
+    "<extra_id_99>"
+  ],
   "clean_up_tokenization_spaces": true,
   "eos_token": "</s>",
-  "extra_ids":
+  "extra_ids": 100,
   "legacy": false,
   "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "<
+  "pad_token": "<pad>",
   "sp_model_kwargs": {},
   "tokenizer_class": "T5Tokenizer",
   "unk_token": "<unk>"
 }
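`extra_ids: 100` tells `T5Tokenizer` to expose `<extra_id_0>` through `<extra_id_99>` as single sentinel tokens, the format T5-style span corruption expects. A minimal sketch (the repo id is a placeholder):

```python
from transformers import T5Tokenizer

# With extra_ids=100 the sentinels tokenize as single ids rather than
# being split by SentencePiece. "path/to/this-repo" is a placeholder.
tok = T5Tokenizer.from_pretrained("path/to/this-repo")
ids = tok("The <extra_id_0> walks in <extra_id_1> park.").input_ids
print(tok.convert_ids_to_tokens(ids))
```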