Upload 9 files

Browse files

secondtokenizer

Files changed (9) hide show

config.json +28 -0
flax_model.msgpack.index.json +523 -0
generation_config.json +7 -0
merges.txt +0 -0
pytorch_model.bin.index.json +523 -0
special_tokens_map.json +1 -0
tf_model.h5.index.json +523 -0
tokenizer_config.json +1 -0
vocab.json +0 -0

config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "_remove_final_layer_norm": false,
+  "activation_dropout": 0.0,
+  "activation_function": "relu",
+  "architectures": [
+    "finomaForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 2,
+  "do_layer_norm_before": true,
+  "dropout": 0.1,
+  "eos_token_id": 2,
+  "ffn_dim": 16384,
+  "hidden_size": 4096,
+  "init_std": 0.02,
+  "layerdrop": 0.0,
+  "max_position_embeddings": 2048,
+  "model_type": "finoma",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "pad_token_id": 1,
+  "prefix": "</s>",
+  "torch_dtype": "float16",
+  "transformers_version": "4.21.0.dev0",
+  "use_cache": true,
+  "vocab_size": 50272,
+  "word_embed_proj_dim": 4096
+}

flax_model.msgpack.index.json ADDED Viewed

	@@ -0,0 +1,523 @@

+{
+  "metadata": {
+    "total_size": 13316947968
+  },
+  "weight_map": {
+    "model/decoder/embed_positions/embedding": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/embed_tokens/embedding": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/0/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/0/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/0/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/0/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/0/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/0/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/0/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/0/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/0/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/0/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/0/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/0/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/0/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/0/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/0/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/0/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/1/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/1/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/1/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/1/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/1/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/1/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/1/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/1/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/1/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/1/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/1/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/1/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/1/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/1/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/1/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/1/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/10/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/10/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/10/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/10/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/10/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/10/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/10/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/10/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/10/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/10/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/10/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/10/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/10/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/10/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/10/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/10/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/11/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/11/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/11/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/11/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/11/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/11/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/11/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/11/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/11/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/11/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/11/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/11/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/11/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/11/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/11/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/11/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/12/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/12/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/12/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/12/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/12/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/12/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/12/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/12/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/12/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/12/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/12/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/12/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/12/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/12/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/12/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/12/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/13/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/13/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/13/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/13/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/13/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/13/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/13/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/13/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/13/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/13/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/13/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/13/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/13/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/13/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/13/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/13/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/14/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/14/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/14/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/14/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/14/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/14/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/14/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/14/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/14/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/14/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/14/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/14/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/14/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/14/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/14/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/14/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/15/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/15/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/15/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/15/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/15/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/15/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/15/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/15/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/15/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/15/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/15/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/15/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/15/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/15/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/15/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/15/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/16/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/16/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/16/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/16/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/16/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/16/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/16/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/16/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/16/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/16/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/16/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/16/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/16/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/16/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/16/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/16/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/17/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/17/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/17/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/17/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/17/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/17/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/17/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/17/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/17/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/17/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/17/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/17/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/17/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/17/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/17/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/17/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/18/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/18/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/18/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/18/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/18/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/18/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/18/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/18/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/18/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/18/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/18/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/18/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/18/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/18/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/18/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/18/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/19/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/19/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/19/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/19/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/19/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/19/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/19/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/19/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/19/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/19/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/19/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/19/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/19/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/19/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/19/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/19/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/2/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/2/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/2/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/2/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/2/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/2/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/2/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/2/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/2/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/2/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/2/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/2/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/2/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/2/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/2/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/2/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/20/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/20/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/20/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/20/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/20/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/20/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/20/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/20/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/20/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/20/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/20/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/20/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/20/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/20/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/20/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/20/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/21/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/21/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/21/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/21/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/21/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/21/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/21/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/21/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/21/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/21/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/21/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/21/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/21/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/21/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/21/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/21/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/22/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/22/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/22/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/22/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/22/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/22/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/22/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/22/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/22/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/22/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/22/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/22/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/22/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/22/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/22/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/22/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/23/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/23/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/23/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/23/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/23/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/23/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/23/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/23/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/23/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/23/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/23/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/23/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/23/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/23/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/23/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/23/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/24/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/24/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/24/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/24/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/24/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/24/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/24/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/24/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/24/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/24/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/24/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/24/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/24/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/24/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/24/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/24/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/25/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/25/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/25/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/25/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/25/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/25/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/25/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/25/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/25/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/25/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/25/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/25/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/25/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/25/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/25/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/25/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/26/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/26/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/26/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/26/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/26/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/26/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/26/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/26/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/26/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/26/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/26/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/26/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/26/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/26/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/26/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/26/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/27/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/27/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/27/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/27/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/27/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/27/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/27/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/27/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/27/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/27/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/27/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/27/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/27/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/27/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/27/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/27/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/28/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/28/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/28/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/28/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/28/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/28/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/28/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/28/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/28/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/28/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/28/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/28/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/28/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/28/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/28/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/28/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/29/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/29/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/29/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/29/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/29/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/29/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/29/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/29/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/29/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/29/self_attn/out_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/29/self_attn/q_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/29/self_attn/q_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/29/self_attn/v_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/29/self_attn/v_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/29/self_attn_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/29/self_attn_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/3/fc1/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/3/fc1/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/3/fc2/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/3/fc2/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/3/final_layer_norm/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/3/final_layer_norm/scale": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/3/self_attn/k_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/3/self_attn/k_proj/kernel": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/3/self_attn/out_proj/bias": "flax_model-00001-of-00002.msgpack",
+    "model/decoder/layers/3/self_attn/out_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/3/self_attn/q_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/3/self_attn/q_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/3/self_attn/v_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/3/self_attn/v_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/3/self_attn_layer_norm/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/3/self_attn_layer_norm/scale": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/30/fc1/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/30/fc1/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/30/fc2/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/30/fc2/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/30/final_layer_norm/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/30/final_layer_norm/scale": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/30/self_attn/k_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/30/self_attn/k_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/30/self_attn/out_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/30/self_attn/out_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/30/self_attn/q_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/30/self_attn/q_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/30/self_attn/v_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/30/self_attn/v_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/30/self_attn_layer_norm/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/30/self_attn_layer_norm/scale": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/31/fc1/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/31/fc1/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/31/fc2/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/31/fc2/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/31/final_layer_norm/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/31/final_layer_norm/scale": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/31/self_attn/k_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/31/self_attn/k_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/31/self_attn/out_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/31/self_attn/out_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/31/self_attn/q_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/31/self_attn/q_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/31/self_attn/v_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/31/self_attn/v_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/31/self_attn_layer_norm/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/31/self_attn_layer_norm/scale": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/4/fc1/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/4/fc1/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/4/fc2/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/4/fc2/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/4/final_layer_norm/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/4/final_layer_norm/scale": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/4/self_attn/k_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/4/self_attn/k_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/4/self_attn/out_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/4/self_attn/out_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/4/self_attn/q_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/4/self_attn/q_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/4/self_attn/v_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/4/self_attn/v_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/4/self_attn_layer_norm/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/4/self_attn_layer_norm/scale": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/5/fc1/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/5/fc1/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/5/fc2/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/5/fc2/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/5/final_layer_norm/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/5/final_layer_norm/scale": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/5/self_attn/k_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/5/self_attn/k_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/5/self_attn/out_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/5/self_attn/out_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/5/self_attn/q_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/5/self_attn/q_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/5/self_attn/v_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/5/self_attn/v_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/5/self_attn_layer_norm/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/5/self_attn_layer_norm/scale": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/6/fc1/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/6/fc1/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/6/fc2/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/6/fc2/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/6/final_layer_norm/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/6/final_layer_norm/scale": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/6/self_attn/k_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/6/self_attn/k_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/6/self_attn/out_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/6/self_attn/out_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/6/self_attn/q_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/6/self_attn/q_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/6/self_attn/v_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/6/self_attn/v_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/6/self_attn_layer_norm/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/6/self_attn_layer_norm/scale": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/7/fc1/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/7/fc1/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/7/fc2/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/7/fc2/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/7/final_layer_norm/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/7/final_layer_norm/scale": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/7/self_attn/k_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/7/self_attn/k_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/7/self_attn/out_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/7/self_attn/out_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/7/self_attn/q_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/7/self_attn/q_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/7/self_attn/v_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/7/self_attn/v_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/7/self_attn_layer_norm/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/7/self_attn_layer_norm/scale": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/8/fc1/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/8/fc1/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/8/fc2/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/8/fc2/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/8/final_layer_norm/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/8/final_layer_norm/scale": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/8/self_attn/k_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/8/self_attn/k_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/8/self_attn/out_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/8/self_attn/out_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/8/self_attn/q_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/8/self_attn/q_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/8/self_attn/v_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/8/self_attn/v_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/8/self_attn_layer_norm/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/8/self_attn_layer_norm/scale": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/9/fc1/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/9/fc1/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/9/fc2/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/9/fc2/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/9/final_layer_norm/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/9/final_layer_norm/scale": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/9/self_attn/k_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/9/self_attn/k_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/9/self_attn/out_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/9/self_attn/out_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/9/self_attn/q_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/9/self_attn/q_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/9/self_attn/v_proj/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/9/self_attn/v_proj/kernel": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/9/self_attn_layer_norm/bias": "flax_model-00002-of-00002.msgpack",
+    "model/decoder/layers/9/self_attn_layer_norm/scale": "flax_model-00002-of-00002.msgpack"
+  }
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 2,
+  "eos_token_id": 2,
+  "pad_token_id": 1,
+  "transformers_version": "4.27.0.dev0"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

pytorch_model.bin.index.json ADDED Viewed

	@@ -0,0 +1,523 @@

+{
+  "metadata": {
+    "total_size": 13316947968
+  },
+  "weight_map": {
+    "decoder.embed_positions.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.0.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.0.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.0.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.0.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.0.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.0.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.0.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.0.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.0.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.0.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.0.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.0.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.0.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.1.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.1.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.1.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.1.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.1.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.1.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.1.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.1.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.1.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.1.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.1.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.1.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.1.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.10.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.10.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.10.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.10.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.10.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.10.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.10.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.10.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.10.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.10.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.10.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.10.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.10.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.11.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.11.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.11.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.11.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.11.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.11.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.11.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.11.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.11.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.11.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.11.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.11.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.11.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.12.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.12.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.12.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.12.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.12.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.12.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.12.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.12.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.12.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.12.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.12.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.12.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.12.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.13.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.13.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.13.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.13.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.13.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.13.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.13.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.13.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.13.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.13.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.13.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.13.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.13.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.14.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.14.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.14.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.14.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.14.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.14.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.14.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.14.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.14.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.14.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.14.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.14.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.14.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.15.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.15.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.15.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.15.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.15.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.15.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.15.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.15.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.15.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.15.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.15.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.15.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.15.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.16.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.16.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.16.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.16.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.16.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.16.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.16.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.16.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.16.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.16.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.16.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.16.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.16.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.17.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.17.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.17.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.17.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.17.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.17.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.17.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.17.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.17.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.17.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.17.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.17.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.17.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.18.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.18.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.18.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.18.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.18.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.18.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.18.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.18.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.18.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.18.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.18.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.18.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.18.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.19.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.19.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.19.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.19.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.19.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.19.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.19.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.19.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.19.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.19.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.19.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.19.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.19.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.2.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.2.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.2.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.2.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.2.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.2.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.2.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.2.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.2.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.2.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.2.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.2.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.2.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.20.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.20.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.20.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.20.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.20.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.20.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.20.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.20.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.20.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.20.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.20.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.20.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.20.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.21.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.21.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.21.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.21.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.21.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.21.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.21.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.21.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.21.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.21.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.21.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.21.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.21.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.22.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.22.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.22.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.22.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.22.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.22.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.22.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.22.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.22.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.22.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.22.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.22.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.22.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.22.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.22.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.22.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.23.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.23.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.23.fc2.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.23.fc2.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.23.final_layer_norm.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.23.final_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.23.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.23.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.23.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.23.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.23.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.23.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.23.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.23.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.23.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.23.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.24.fc1.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.24.fc1.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.24.fc2.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.24.fc2.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.24.final_layer_norm.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.24.final_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.24.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.24.self_attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.24.self_attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.24.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.24.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.24.self_attn_layer_norm.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.24.self_attn_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.25.fc1.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.25.fc1.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.25.fc2.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.25.fc2.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.25.final_layer_norm.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.25.final_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.25.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.25.self_attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.25.self_attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.25.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.25.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.25.self_attn_layer_norm.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.25.self_attn_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.26.fc1.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.26.fc1.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.26.fc2.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.26.fc2.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.26.final_layer_norm.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.26.final_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.26.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.26.self_attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.26.self_attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.26.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.26.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.26.self_attn_layer_norm.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.26.self_attn_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.27.fc1.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.27.fc1.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.27.fc2.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.27.fc2.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.27.final_layer_norm.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.27.final_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.27.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.27.self_attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.27.self_attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.27.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.27.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.27.self_attn_layer_norm.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.27.self_attn_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.28.fc1.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.28.fc1.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.28.fc2.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.28.fc2.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.28.final_layer_norm.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.28.final_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.28.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.28.self_attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.28.self_attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.28.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.28.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.28.self_attn_layer_norm.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.28.self_attn_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.29.fc1.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.29.fc1.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.29.fc2.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.29.fc2.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.29.final_layer_norm.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.29.final_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.29.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.29.self_attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.29.self_attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.29.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.29.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.29.self_attn_layer_norm.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.29.self_attn_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.3.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.3.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.3.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.3.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.3.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.3.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.3.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.3.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.3.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.3.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.3.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.3.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.3.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.30.fc1.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.30.fc1.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.30.fc2.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.30.fc2.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.30.final_layer_norm.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.30.final_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.30.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.30.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.30.self_attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.30.self_attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.30.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.30.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.30.self_attn_layer_norm.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.30.self_attn_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.31.fc1.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.31.fc1.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.31.fc2.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.31.fc2.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.31.final_layer_norm.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.31.final_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.31.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.31.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.31.self_attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.31.self_attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.31.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.31.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.31.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.31.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.31.self_attn_layer_norm.bias": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.31.self_attn_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+    "decoder.layers.4.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.4.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.4.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.4.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.4.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.4.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.4.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.4.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.4.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.4.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.4.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.4.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.4.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.5.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.5.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.5.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.5.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.5.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.5.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.5.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.5.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.5.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.5.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.5.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.5.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.5.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.6.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.6.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.6.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.6.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.6.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.6.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.6.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.6.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.6.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.6.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.6.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.6.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.6.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.7.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.7.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.7.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.7.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.7.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.7.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.7.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.7.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.7.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.7.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.7.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.7.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.7.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.8.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.8.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.8.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.8.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.8.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.8.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.8.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.8.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.8.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.8.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.8.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.8.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.8.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.9.fc1.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.9.fc1.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.9.fc2.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.9.fc2.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.9.final_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.9.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.9.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.9.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.9.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.9.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.9.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.9.self_attn_layer_norm.bias": "pytorch_model-00001-of-00002.bin",
+    "decoder.layers.9.self_attn_layer_norm.weight": "pytorch_model-00001-of-00002.bin"
+  }
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"bos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}}

tf_model.h5.index.json ADDED Viewed

	@@ -0,0 +1,523 @@

+{
+  "metadata": {
+    "total_size": 13316947968
+  },
+  "weight_map": {
+    "tfopt_for_causal_lm/model/decoder/embed_positions/weight:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/embed_tokens/weight:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.0/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.0/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.0/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.0/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.0/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.0/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.0/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.0/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.0/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.0/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.0/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.0/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.0/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.0/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.0/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.0/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.1/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.1/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.1/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.1/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.1/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.1/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.1/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.1/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.1/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.1/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.1/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.1/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.1/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.1/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.1/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.1/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.10/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.10/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.10/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.10/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.10/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.10/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.10/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.10/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.10/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.10/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.10/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.10/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.10/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.10/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.10/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.10/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.11/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.11/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.11/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.11/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.11/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.11/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.11/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.11/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.11/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.11/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.11/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.11/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.11/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.11/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.11/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.11/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.12/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.12/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.12/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.12/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.12/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.12/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.12/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.12/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.12/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.12/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.12/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.12/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.12/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.12/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.12/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.12/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.13/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.13/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.13/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.13/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.13/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.13/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.13/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.13/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.13/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.13/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.13/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.13/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.13/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.13/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.13/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.13/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.14/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.14/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.14/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.14/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.14/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.14/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.14/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.14/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.14/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.14/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.14/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.14/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.14/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.14/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.14/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.14/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.15/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.15/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.15/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.15/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.15/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.15/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.15/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.15/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.15/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.15/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.15/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.15/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.15/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.15/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.15/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.15/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.16/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.16/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.16/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.16/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.16/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.16/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.16/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.16/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.16/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.16/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.16/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.16/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.16/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.16/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.16/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.16/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.17/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.17/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.17/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.17/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.17/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.17/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.17/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.17/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.17/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.17/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.17/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.17/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.17/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.17/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.17/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.17/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.18/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.18/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.18/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.18/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.18/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.18/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.18/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.18/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.18/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.18/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.18/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.18/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.18/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.18/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.18/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.18/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.19/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.19/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.19/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.19/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.19/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.19/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.19/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.19/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.19/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.19/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.19/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.19/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.19/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.19/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.19/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.19/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.2/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.2/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.2/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.2/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.2/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.2/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.2/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.2/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.2/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.2/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.2/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.2/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.2/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.2/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.2/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.2/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.20/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.20/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.20/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.20/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.20/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.20/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.20/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.20/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.20/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.20/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.20/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.20/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.20/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.20/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.20/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.20/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.21/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.21/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.21/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.21/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.21/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.21/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.21/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.21/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.21/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.21/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.21/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.21/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.21/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.21/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.21/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.21/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.22/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.22/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.22/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.22/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.22/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.22/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.22/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.22/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.22/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.22/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.22/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.22/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.22/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.22/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.22/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.22/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.23/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.23/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.23/fc2/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.23/fc2/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.23/final_layer_norm/beta:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.23/final_layer_norm/gamma:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.23/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.23/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.23/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.23/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.23/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.23/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.23/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.23/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.23/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.23/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.24/fc1/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.24/fc1/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.24/fc2/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.24/fc2/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.24/final_layer_norm/beta:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.24/final_layer_norm/gamma:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.24/self_attn/k_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.24/self_attn/k_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.24/self_attn/out_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.24/self_attn/out_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.24/self_attn/q_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.24/self_attn/q_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.24/self_attn/v_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.24/self_attn/v_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.24/self_attn_layer_norm/beta:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.24/self_attn_layer_norm/gamma:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.25/fc1/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.25/fc1/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.25/fc2/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.25/fc2/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.25/final_layer_norm/beta:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.25/final_layer_norm/gamma:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.25/self_attn/k_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.25/self_attn/k_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.25/self_attn/out_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.25/self_attn/out_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.25/self_attn/q_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.25/self_attn/q_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.25/self_attn/v_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.25/self_attn/v_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.25/self_attn_layer_norm/beta:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.25/self_attn_layer_norm/gamma:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.26/fc1/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.26/fc1/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.26/fc2/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.26/fc2/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.26/final_layer_norm/beta:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.26/final_layer_norm/gamma:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.26/self_attn/k_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.26/self_attn/k_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.26/self_attn/out_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.26/self_attn/out_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.26/self_attn/q_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.26/self_attn/q_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.26/self_attn/v_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.26/self_attn/v_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.26/self_attn_layer_norm/beta:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.26/self_attn_layer_norm/gamma:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.27/fc1/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.27/fc1/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.27/fc2/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.27/fc2/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.27/final_layer_norm/beta:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.27/final_layer_norm/gamma:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.27/self_attn/k_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.27/self_attn/k_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.27/self_attn/out_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.27/self_attn/out_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.27/self_attn/q_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.27/self_attn/q_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.27/self_attn/v_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.27/self_attn/v_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.27/self_attn_layer_norm/beta:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.27/self_attn_layer_norm/gamma:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.28/fc1/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.28/fc1/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.28/fc2/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.28/fc2/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.28/final_layer_norm/beta:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.28/final_layer_norm/gamma:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.28/self_attn/k_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.28/self_attn/k_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.28/self_attn/out_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.28/self_attn/out_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.28/self_attn/q_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.28/self_attn/q_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.28/self_attn/v_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.28/self_attn/v_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.28/self_attn_layer_norm/beta:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.28/self_attn_layer_norm/gamma:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.29/fc1/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.29/fc1/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.29/fc2/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.29/fc2/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.29/final_layer_norm/beta:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.29/final_layer_norm/gamma:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.29/self_attn/k_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.29/self_attn/k_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.29/self_attn/out_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.29/self_attn/out_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.29/self_attn/q_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.29/self_attn/q_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.29/self_attn/v_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.29/self_attn/v_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.29/self_attn_layer_norm/beta:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.29/self_attn_layer_norm/gamma:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.3/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.3/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.3/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.3/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.3/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.3/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.3/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.3/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.3/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.3/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.3/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.3/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.3/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.3/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.3/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.3/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.30/fc1/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.30/fc1/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.30/fc2/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.30/fc2/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.30/final_layer_norm/beta:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.30/final_layer_norm/gamma:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.30/self_attn/k_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.30/self_attn/k_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.30/self_attn/out_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.30/self_attn/out_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.30/self_attn/q_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.30/self_attn/q_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.30/self_attn/v_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.30/self_attn/v_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.30/self_attn_layer_norm/beta:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.30/self_attn_layer_norm/gamma:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.31/fc1/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.31/fc1/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.31/fc2/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.31/fc2/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.31/final_layer_norm/beta:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.31/final_layer_norm/gamma:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.31/self_attn/k_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.31/self_attn/k_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.31/self_attn/out_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.31/self_attn/out_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.31/self_attn/q_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.31/self_attn/q_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.31/self_attn/v_proj/bias:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.31/self_attn/v_proj/kernel:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.31/self_attn_layer_norm/beta:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.31/self_attn_layer_norm/gamma:0": "tf_model-00002-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.4/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.4/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.4/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.4/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.4/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.4/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.4/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.4/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.4/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.4/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.4/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.4/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.4/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.4/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.4/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.4/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.5/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.5/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.5/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.5/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.5/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.5/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.5/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.5/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.5/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.5/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.5/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.5/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.5/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.5/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.5/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.5/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.6/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.6/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.6/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.6/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.6/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.6/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.6/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.6/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.6/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.6/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.6/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.6/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.6/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.6/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.6/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.6/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.7/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.7/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.7/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.7/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.7/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.7/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.7/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.7/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.7/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.7/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.7/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.7/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.7/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.7/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.7/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.7/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.8/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.8/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.8/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.8/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.8/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.8/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.8/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.8/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.8/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.8/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.8/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.8/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.8/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.8/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.8/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.8/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.9/fc1/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.9/fc1/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.9/fc2/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.9/fc2/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.9/final_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.9/final_layer_norm/gamma:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.9/self_attn/k_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.9/self_attn/k_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.9/self_attn/out_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.9/self_attn/out_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.9/self_attn/q_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.9/self_attn/q_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.9/self_attn/v_proj/bias:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.9/self_attn/v_proj/kernel:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.9/self_attn_layer_norm/beta:0": "tf_model-00001-of-00002.h5",
+    "tfopt_for_causal_lm/model/decoder/layers.9/self_attn_layer_norm/gamma:0": "tf_model-00001-of-00002.h5"
+  }
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"errors": "replace", "unk_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "add_bos_token": true, "special_tokens_map_file": null, "name_or_path": "sayril007/finoma-tokeinzer-main"}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff