diff --git a/.gitattributes b/.gitattributes
index f2ce4453a3682e00d1155a8fde953914118e2113..5b0b9613c3d3a8b6f6a52f5c455a56e9c79ab7e7 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -26,4 +26,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
tokenizer.json filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
index 011ff227539eef44a40ed9dc4c6390be3c690955..527eab0b994bc6cfe76467b574c36064cd7a6a6a 100644
--- a/README.md
+++ b/README.md
@@ -146,13 +146,13 @@ widget:
example_title: Grammar exercise 2
group: English
- text: |-
- Traduction en français: Dans cet essai je vais m'interroger sur la conscience des modèles d'intelligence artificielle récents comme les modèles de langue. Pour commencer, je m'intéresserai à la notion de conscience et à ce qui la caractérise. Ensuite, j'aborderai la question de l'intelligence et de son lien avec le langage. Enfin, dans une dernière partie je me pencherai sur le cas de l'IA et sur sa conscience.
- Traduction en espagnol:
+ Dans cet essai je vais m'interroger sur la conscience des modèles d'intelligence artificielle récents comme les modèles de langue. Pour commencer, je m'intéresserai à la notion de conscience et à ce qui la caractérise. Ensuite, j'aborderai la question de l'intelligence et de son lien avec le langage. Enfin, dans une dernière partie je me pencherai sur le cas de l'IA et sur sa conscience.
+ Traduction en espagnol: «
example_title: Translation to Spanish
group: French
- text: |-
- Traducción al francés: Dans cet essai je vais m'interroger sur la conscience des modèles d'intelligence artificielle récents comme les modèles de langue. Pour commencer, je m'intéresserai à la notion de conscience et à ce qui la caractérise. Ensuite, j'aborderai la question de l'intelligence et de son lien avec le langage. Enfin, dans une dernière partie je me pencherai sur le cas de l'IA et sur sa conscience.
- Traducción al español:
+ Dans cet essai je vais m'interroger sur la conscience des modèles d'intelligence artificielle récents comme les modèles de langue. Pour commencer, je m'intéresserai à la notion de conscience et à ce qui la caractérise. Ensuite, j'aborderai la question de l'intelligence et de son lien avec le langage. Enfin, dans une dernière partie je me pencherai sur le cas de l'IA et sur sa conscience.
+ Traduction en espagnol: «
example_title: Translation from French
group: Spanish
- text: ذات مرة ، عاش شبل الدب في الغابة
@@ -165,50 +165,1614 @@ widget:
example_title: Fairy tale
group: French
- text: |-
- Q: A juggler can juggle 16 balls. Half of the balls are golf balls, and half of the golf balls are blue. How many blue golf balls are there?
- A: Let's think step by step.
+ Q: A juggler can juggle 16 balls. Half of the balls are golf balls, and half of the golf balls are blue. How many blue golf balls are there?
+ A: Let's think step by step.
example_title: Mathematical reasoning
group: English
-
-co2_eq_emissions:
- emissions: 24_700_000
- source: "Estimating the Carbon Footprint of BLOOM, a 176B Parameter Language Model. https://arxiv.org/abs/2211.02001"
- training_type: "pre-training"
- geographical_location: "Orsay, France"
- hardware_used: "384 A100 80GB GPUs"
-
model-index:
- name: bloom
results:
- task:
type: text-generation
+ name: text generation
+ dataset:
+ name: arc_challenge
+ type: arc_challenge
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.4112627986348123
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: arc_easy
+ type: arc_easy
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.726010101010101
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: axb
+ type: axb
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.5751811594202898
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: axg
+ type: axg
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.5252808988764045
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: boolq
+ type: boolq
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.6345565749235474
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: cb
+ type: cb
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.3392857142857143
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: cola
+ type: cola
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.39022051773729627
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: copa
+ type: copa
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.56
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: crows_pairs_english
+ type: crows_pairs_english
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.5
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: crows_pairs_french
+ type: crows_pairs_french
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.505664877757901
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: diabla
+ type: diabla
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.2947981906750174
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_afr
+ type: gsarti/flores_101_afr
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 4.25431550058444
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_amh
+ type: gsarti/flores_101_amh
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.716877477347089
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_ara
+ type: gsarti/flores_101_ara
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 1.7049030137120964
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_asm
+ type: gsarti/flores_101_asm
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 6.576581380404954
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_ast
+ type: gsarti/flores_101_ast
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.8562364775797944
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_azj
+ type: gsarti/flores_101_azj
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 4.80721528624391
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_bel
+ type: gsarti/flores_101_bel
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.7312177406635065
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_ben
+ type: gsarti/flores_101_ben
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 5.993409478990023
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_bos
+ type: gsarti/flores_101_bos
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.5936169095529493
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_bul
+ type: gsarti/flores_101_bul
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.159035321398085
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_cat
+ type: gsarti/flores_101_cat
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.167873680006659
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_ceb
+ type: gsarti/flores_101_ceb
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 5.286975089885673
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_ces
+ type: gsarti/flores_101_ces
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.4516208322236017
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_ckb
+ type: gsarti/flores_101_ckb
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.7051034724765612
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_cym
+ type: gsarti/flores_101_cym
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 7.0889312398688125
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_dan
+ type: gsarti/flores_101_dan
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.4300748208111838
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_deu
+ type: gsarti/flores_101_deu
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.3380585896268107
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_ell
+ type: gsarti/flores_101_ell
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 1.9595604725375586
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_eng
+ type: gsarti/flores_101_eng
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 1.8819637649637901
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_est
+ type: gsarti/flores_101_est
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 5.773850600380297
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_fas
+ type: gsarti/flores_101_fas
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.4306140728294086
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_fin
+ type: gsarti/flores_101_fin
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 4.304305536244342
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_fra
+ type: gsarti/flores_101_fra
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 1.9374688438541796
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_ful
+ type: gsarti/flores_101_ful
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 9.740353097219378
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_gle
+ type: gsarti/flores_101_gle
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 6.035269765075012
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_glg
+ type: gsarti/flores_101_glg
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.365451129546636
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_guj
+ type: gsarti/flores_101_guj
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 5.70676742569154
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_hau
+ type: gsarti/flores_101_hau
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 8.855204288260023
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_heb
+ type: gsarti/flores_101_heb
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.920943798471208
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_hin
+ type: gsarti/flores_101_hin
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 5.452028001573195
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_hrv
+ type: gsarti/flores_101_hrv
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.7056829077179225
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_hun
+ type: gsarti/flores_101_hun
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 4.058579478967854
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_hye
+ type: gsarti/flores_101_hye
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.127237816041562
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_ibo
+ type: gsarti/flores_101_ibo
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.9500357969906683
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_ind
+ type: gsarti/flores_101_ind
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 1.976163584180101
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_isl
+ type: gsarti/flores_101_isl
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 5.500542085165231
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_ita
+ type: gsarti/flores_101_ita
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.314465100752677
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_jav
+ type: gsarti/flores_101_jav
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 4.942322446550142
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_jpn
+ type: gsarti/flores_101_jpn
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.259421750521777
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_kam
+ type: gsarti/flores_101_kam
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 9.743025325635475
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_kan
+ type: gsarti/flores_101_kan
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 6.233724699944989
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_kat
+ type: gsarti/flores_101_kat
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.0508893415872107
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_kaz
+ type: gsarti/flores_101_kaz
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.0390148516287927
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_kea
+ type: gsarti/flores_101_kea
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 7.147132270533836
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_khm
+ type: gsarti/flores_101_khm
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.366514710252477
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_kir
+ type: gsarti/flores_101_kir
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.2413845359487885
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_kor
+ type: gsarti/flores_101_kor
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.9023196482741027
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_lao
+ type: gsarti/flores_101_lao
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.331446855837494
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_lav
+ type: gsarti/flores_101_lav
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 5.223609016485348
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_lin
+ type: gsarti/flores_101_lin
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 4.847471204107301
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_lit
+ type: gsarti/flores_101_lit
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 4.5432035498036765
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_ltz
+ type: gsarti/flores_101_ltz
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 5.5910516978201015
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_lug
+ type: gsarti/flores_101_lug
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 5.4301049946044175
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_luo
+ type: gsarti/flores_101_luo
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 12.031029857399394
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_mal
+ type: gsarti/flores_101_mal
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 4.794302548141229
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_mar
+ type: gsarti/flores_101_mar
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 6.856682255407709
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_mkd
+ type: gsarti/flores_101_mkd
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.3354144607382983
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_mlt
+ type: gsarti/flores_101_mlt
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 9.04135227904975
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_mon
+ type: gsarti/flores_101_mon
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.094907723618666
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_mri
+ type: gsarti/flores_101_mri
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 5.2659698341456505
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_msa
+ type: gsarti/flores_101_msa
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.2220779892820985
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_mya
+ type: gsarti/flores_101_mya
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.5229159853414433
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_nld
+ type: gsarti/flores_101_nld
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.799153089002766
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_nob
+ type: gsarti/flores_101_nob
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.628942049758715
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_npi
+ type: gsarti/flores_101_npi
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 6.666236527803879
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_nso
+ type: gsarti/flores_101_nso
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 5.015319074943932
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_nya
+ type: gsarti/flores_101_nya
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 4.938044040751036
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_oci
+ type: gsarti/flores_101_oci
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.607440766288032
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_orm
+ type: gsarti/flores_101_orm
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 11.31585044916705
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_ory
+ type: gsarti/flores_101_ory
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 5.981891184515959
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_pan
+ type: gsarti/flores_101_pan
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 4.7716086841502685
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_pol
+ type: gsarti/flores_101_pol
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.01200174157614
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_por
+ type: gsarti/flores_101_por
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 1.8411472115156693
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_pus
+ type: gsarti/flores_101_pus
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 4.623872921169341
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_ron
+ type: gsarti/flores_101_ron
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.049829411973529
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_rus
+ type: gsarti/flores_101_rus
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 1.7083443875791493
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_slk
+ type: gsarti/flores_101_slk
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 4.037719650548048
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_slv
+ type: gsarti/flores_101_slv
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 4.141036287764831
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_sna
+ type: gsarti/flores_101_sna
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 4.7109183690601295
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_snd
+ type: gsarti/flores_101_snd
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 4.206170931541356
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_som
+ type: gsarti/flores_101_som
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 9.154342083821405
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_spa
+ type: gsarti/flores_101_spa
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 1.7955816311143258
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_srp
+ type: gsarti/flores_101_srp
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.241096141430147
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_swe
+ type: gsarti/flores_101_swe
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.344977179674293
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_swh
+ type: gsarti/flores_101_swh
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.6844272218041634
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_tam
+ type: gsarti/flores_101_tam
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 5.1645951632801745
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_tel
+ type: gsarti/flores_101_tel
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 6.8098996634099445
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_tgk
+ type: gsarti/flores_101_tgk
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.785457016715163
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_tgl
+ type: gsarti/flores_101_tgl
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.7498953645610875
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_tha
+ type: gsarti/flores_101_tha
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.104151663233468
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_tur
+ type: gsarti/flores_101_tur
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 3.3178240103796037
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_ukr
+ type: gsarti/flores_101_ukr
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.088543437159643
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_umb
+ type: gsarti/flores_101_umb
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 11.766013385445124
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_urd
+ type: gsarti/flores_101_urd
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 1.7788699847612357
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_uzb
+ type: gsarti/flores_101_uzb
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 8.499879863290486
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_vie
+ type: gsarti/flores_101_vie
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 1.65901207387262
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_wol
+ type: gsarti/flores_101_wol
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 6.141703791276928
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_xho
+ type: gsarti/flores_101_xho
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 4.690199677955254
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_yor
+ type: gsarti/flores_101_yor
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 4.360585696242932
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_zho_simpl
+ type: gsarti/flores_101_zho_simpl
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.1183545781883515
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_zho_trad
+ type: gsarti/flores_101_zho_trad
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 2.273787884962656
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: gsarti/flores_101_zul
+ type: gsarti/flores_101_zul
+ metrics:
+ - name: byte_perplexity
+ type: byte_perplexity
+ value: 6.016954767729589
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: headqa
+ type: headqa
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.3464624361779723
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: hellaswag
+ type: hellaswag
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.5353515236008763
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: lambada_mt_de
+ type: lambada_mt_de
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.3291286629148069
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: lambada_mt_en
+ type: lambada_mt_en
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.6720357073549389
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: lambada_mt_es
+ type: lambada_mt_es
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.476421502037648
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: lambada_mt_it
+ type: lambada_mt_it
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.4061711624296526
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: logiqa
+ type: logiqa
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.2350230414746544
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: mathqa
+ type: mathqa
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.27671691792294806
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: mc_taco
+ type: mc_taco
+ metrics:
+ - name: em
+ type: em
+ value: 0.13063063063063063
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: mnli
+ type: mnli
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.3545565500406835
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: mnli_mismatched
+ type: mnli_mismatched
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.3545565500406835
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: mrpc
+ type: mrpc
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.3872549019607843
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: multirc
+ type: multirc
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.570957095709571
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: openbookqa
+ type: openbookqa
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.312
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: piqa
+ type: piqa
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.7812840043525572
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: prost
+ type: prost
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.2977156276686593
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: pubmedqa
+ type: pubmedqa
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.741
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: qnli
+ type: qnli
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.5172981878088962
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: qqp
+ type: qqp
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.5883007667573584
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: race
+ type: race
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.39043062200956935
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: rte
+ type: rte
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.5198555956678701
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: sciq
+ type: sciq
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.936
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: sst
+ type: sst
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.6043577981651376
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: triviaqa
+ type: triviaqa
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.18332891363917617
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: tydiqa_primary
+ type: tydiqa_primary
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.2809817301342725
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: webqs
+ type: webqs
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.061515748031496065
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: wic
+ type: wic
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.5062695924764891
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: winogrande
+ type: winogrande
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.7095501183898973
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: wnli
+ type: wnli
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.5704225352112676
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
+ dataset:
+ name: wsc
+ type: wsc
+ metrics:
+ - name: acc
+ type: acc
+ value: 0.5192307692307693
+ verified: false
+ - task:
+ type: text-generation
+ name: text generation
dataset:
- type: openai_humaneval
name: humaneval
+ type: humaneval
metrics:
- name: pass@1
type: pass@1
- value: 0.15542682926829265
+ value: 0.15524390243902436
verified: false
- name: pass@10
type: pass@10
- value: 0.3278356276947017
+ value: 0.3220367632383857
verified: false
- name: pass@100
type: pass@100
- value: 0.5719815685597749
+ value: 0.5545431515723145
verified: false
---
-
+
BigScience Large Open-science Open-access Multilingual Language Model
Version 1.3 / 6 July 2022
Current Checkpoint: **Training Iteration 95000**
-Link to paper: [here](https://arxiv.org/abs/2211.05100)
-
Total seen tokens: **366B**
---
@@ -274,9 +1838,7 @@ Please see [the BLOOM training README](https://github.com/bigscience-workshop/bi
* ALiBI positional encodings (see [paper](https://arxiv.org/pdf/2108.12409.pdf)), with GeLU activation functions
-* 176,247,271,424 parameters:
-
- * 3,596,615,680 embedding parameters
+* 176 billion parameters:
* 70 layers, 112 attention heads
@@ -602,6 +2164,7 @@ Model may:
## Metrics
*This section describes the different ways performance is calculated and why.*
+
Includes:
| Metric | Why chosen |
@@ -625,15 +2188,158 @@ And multiple different metrics for specific tasks. _(More evaluation metrics for
**Zero-shot evaluations:**
-WARNING: This section used to contain much more results, however they were not correct and we released without the approval of the evaluation working group. We are currently in the process of fixing the evaluations.
+**WARNING: These are intermediate results.**
See this repository for JSON files: https://github.com/bigscience-workshop/evaluation-results
| Task | Language | Metric | BLOOM-176B | OPT-175B* |
|:--------|:-----------------|:------------------------|-------------:|------------:|
+| arc_challenge | eng | acc ↑ | 0.411 | 0.412 |
+| arc_easy | eng | acc ↑ | 0.726 | 0.751 |
+| axb (Median of 10 prompts) | eng | acc ↑ | 0.575 | 0.532 |
+| axg (Median of 10 prompts) | eng | acc ↑ | 0.525 | 0.548 |
+| boolq (Median of 11 prompts) | eng | acc ↑ | 0.635 | 0.622 |
+| cb (Median of 15 prompts) | eng | acc ↑ | 0.339 | 0.411 |
+| cola (Median of 5 prompts) | eng | acc ↑ | 0.39 | 0.444 |
+| copa (Median of 9 prompts) | eng | acc ↑ | 0.56 | 0.55 |
+| crows_pairs_english (Median of 6 prompts) | eng | acc ↑ | 0.5 | 0.502 |
+| crows_pairs_french (Median of 7 prompts) | fra | acc ↑ | 0.506 | 0.499 |
+| diabla (Median of 2 prompts) | eng | acc ↑ | 0.295 | 0.289 |
+| gsarti/flores_101_afr | afr | byte_perplexity ↓ | 4.254 | 3.381 |
+| gsarti/flores_101_amh | amh | byte_perplexity ↓ | 3.717 | 3.87 |
+| gsarti/flores_101_ara | ara | byte_perplexity ↓ | 1.705 | 2.42 |
+| gsarti/flores_101_asm | asm | byte_perplexity ↓ | 6.577 | 3.028 |
+| gsarti/flores_101_ast | ast | byte_perplexity ↓ | 2.856 | 4.737 |
+| gsarti/flores_101_azj | azj | byte_perplexity ↓ | 4.807 | 4.767 |
+| gsarti/flores_101_bel | bel | byte_perplexity ↓ | 2.731 | 2.557 |
+| gsarti/flores_101_ben | ben | byte_perplexity ↓ | 5.993 | 2.243 |
+| gsarti/flores_101_bos | bos | byte_perplexity ↓ | 3.594 | 2.668 |
+| gsarti/flores_101_bul | bul | byte_perplexity ↓ | 2.159 | 2.099 |
+| gsarti/flores_101_cat | cat | byte_perplexity ↓ | 2.168 | 2.837 |
+| gsarti/flores_101_ceb | ceb | byte_perplexity ↓ | 5.287 | 3.636 |
+| gsarti/flores_101_ces | ces | byte_perplexity ↓ | 3.452 | 2.749 |
+| gsarti/flores_101_ckb | ckb | byte_perplexity ↓ | 3.705 | 4.688 |
+| gsarti/flores_101_cym | cym | byte_perplexity ↓ | 7.089 | 5.075 |
+| gsarti/flores_101_dan | dan | byte_perplexity ↓ | 3.43 | 2.492 |
+| gsarti/flores_101_deu | deu | byte_perplexity ↓ | 2.338 | 2.099 |
+| gsarti/flores_101_ell | ell | byte_perplexity ↓ | 1.96 | 1.811 |
+| gsarti/flores_101_eng | eng | byte_perplexity ↓ | 1.882 | 1.9 |
+| gsarti/flores_101_est | est | byte_perplexity ↓ | 5.774 | 3.533 |
+| gsarti/flores_101_fas | fas | byte_perplexity ↓ | 2.431 | 2.444 |
+| gsarti/flores_101_fin | fin | byte_perplexity ↓ | 4.304 | 2.601 |
+| gsarti/flores_101_fra | fra | byte_perplexity ↓ | 1.937 | 1.984 |
+| gsarti/flores_101_ful | ful | byte_perplexity ↓ | 9.74 | 11.84 |
+| gsarti/flores_101_gle | gle | byte_perplexity ↓ | 6.035 | 3.914 |
+| gsarti/flores_101_glg | glg | byte_perplexity ↓ | 2.365 | 3.015 |
+| gsarti/flores_101_guj | guj | byte_perplexity ↓ | 5.707 | 2.438 |
+| gsarti/flores_101_hau | hau | byte_perplexity ↓ | 8.855 | 5.283 |
+| gsarti/flores_101_heb | heb | byte_perplexity ↓ | 2.921 | 2.903 |
+| gsarti/flores_101_hin | hin | byte_perplexity ↓ | 5.452 | 1.86 |
+| gsarti/flores_101_hrv | hrv | byte_perplexity ↓ | 3.706 | 2.715 |
+| gsarti/flores_101_hun | hun | byte_perplexity ↓ | 4.059 | 2.865 |
+| gsarti/flores_101_hye | hye | byte_perplexity ↓ | 3.127 | 3.411 |
+| gsarti/flores_101_ibo | ibo | byte_perplexity ↓ | 3.95 | 8.008 |
+| gsarti/flores_101_ind | ind | byte_perplexity ↓ | 1.976 | 2.632 |
+| gsarti/flores_101_isl | isl | byte_perplexity ↓ | 5.501 | 4.701 |
+| gsarti/flores_101_ita | ita | byte_perplexity ↓ | 2.314 | 2.104 |
+| gsarti/flores_101_jav | jav | byte_perplexity ↓ | 4.942 | 8.16 |
+| gsarti/flores_101_jpn | jpn | byte_perplexity ↓ | 2.259 | 2.198 |
+| gsarti/flores_101_kam | kam | byte_perplexity ↓ | 9.743 | 10.981 |
+| gsarti/flores_101_kan | kan | byte_perplexity ↓ | 6.234 | 2.373 |
+| gsarti/flores_101_kat | kat | byte_perplexity ↓ | 2.051 | 2.466 |
+| gsarti/flores_101_kaz | kaz | byte_perplexity ↓ | 3.039 | 4.376 |
+| gsarti/flores_101_kea | kea | byte_perplexity ↓ | 7.147 | 9.632 |
+| gsarti/flores_101_khm | khm | byte_perplexity ↓ | 3.367 | 2.646 |
+| gsarti/flores_101_kir | kir | byte_perplexity ↓ | 3.241 | 4.522 |
+| gsarti/flores_101_kor | kor | byte_perplexity ↓ | 2.902 | 3.376 |
+| gsarti/flores_101_lao | lao | byte_perplexity ↓ | 2.331 | 3.106 |
+| gsarti/flores_101_lav | lav | byte_perplexity ↓ | 5.224 | 4.811 |
+| gsarti/flores_101_lin | lin | byte_perplexity ↓ | 4.847 | 8.871 |
+| gsarti/flores_101_lit | lit | byte_perplexity ↓ | 4.543 | 5.183 |
+| gsarti/flores_101_ltz | ltz | byte_perplexity ↓ | 5.591 | 7.158 |
+| gsarti/flores_101_lug | lug | byte_perplexity ↓ | 5.43 | 7.399 |
+| gsarti/flores_101_luo | luo | byte_perplexity ↓ | 12.031 | 11.951 |
+| gsarti/flores_101_mal | mal | byte_perplexity ↓ | 4.794 | 2.054 |
+| gsarti/flores_101_mar | mar | byte_perplexity ↓ | 6.857 | 2.274 |
+| gsarti/flores_101_mkd | mkd | byte_perplexity ↓ | 2.335 | 2.538 |
+| gsarti/flores_101_mlt | mlt | byte_perplexity ↓ | 9.041 | 5.996 |
+| gsarti/flores_101_mon | mon | byte_perplexity ↓ | 3.095 | 4.519 |
+| gsarti/flores_101_mri | mri | byte_perplexity ↓ | 5.266 | 4.438 |
+| gsarti/flores_101_msa | msa | byte_perplexity ↓ | 2.222 | 2.935 |
+| gsarti/flores_101_mya | mya | byte_perplexity ↓ | 2.523 | 2.413 |
+| gsarti/flores_101_nld | nld | byte_perplexity ↓ | 2.799 | 2.293 |
+| gsarti/flores_101_nob | nob | byte_perplexity ↓ | 3.629 | 2.593 |
+| gsarti/flores_101_npi | npi | byte_perplexity ↓ | 6.666 | 2.499 |
+| gsarti/flores_101_nso | nso | byte_perplexity ↓ | 5.015 | 8.485 |
+| gsarti/flores_101_nya | nya | byte_perplexity ↓ | 4.938 | 7.548 |
+| gsarti/flores_101_oci | oci | byte_perplexity ↓ | 3.607 | 4.936 |
+| gsarti/flores_101_orm | orm | byte_perplexity ↓ | 11.316 | 7.145 |
+| gsarti/flores_101_ory | ory | byte_perplexity ↓ | 5.982 | 2.668 |
+| gsarti/flores_101_pan | pan | byte_perplexity ↓ | 4.772 | 2.782 |
+| gsarti/flores_101_pol | pol | byte_perplexity ↓ | 3.012 | 2.432 |
+| gsarti/flores_101_por | por | byte_perplexity ↓ | 1.841 | 2.178 |
+| gsarti/flores_101_pus | pus | byte_perplexity ↓ | 4.624 | 4.785 |
+| gsarti/flores_101_ron | ron | byte_perplexity ↓ | 3.05 | 2.197 |
+| gsarti/flores_101_rus | rus | byte_perplexity ↓ | 1.708 | 1.689 |
+| gsarti/flores_101_slk | slk | byte_perplexity ↓ | 4.038 | 3.419 |
+| gsarti/flores_101_slv | slv | byte_perplexity ↓ | 4.141 | 3.582 |
+| gsarti/flores_101_sna | sna | byte_perplexity ↓ | 4.711 | 5.588 |
+| gsarti/flores_101_snd | snd | byte_perplexity ↓ | 4.206 | 5.667 |
+| gsarti/flores_101_som | som | byte_perplexity ↓ | 9.154 | 4.788 |
+| gsarti/flores_101_spa | spa | byte_perplexity ↓ | 1.796 | 2.098 |
+| gsarti/flores_101_srp | srp | byte_perplexity ↓ | 2.241 | 2.688 |
+| gsarti/flores_101_swe | swe | byte_perplexity ↓ | 3.345 | 2.468 |
+| gsarti/flores_101_swh | swh | byte_perplexity ↓ | 2.684 | 4.473 |
+| gsarti/flores_101_tam | tam | byte_perplexity ↓ | 5.165 | 2.024 |
+| gsarti/flores_101_tel | tel | byte_perplexity ↓ | 6.81 | 2.407 |
+| gsarti/flores_101_tgk | tgk | byte_perplexity ↓ | 3.785 | 4.899 |
+| gsarti/flores_101_tgl | tgl | byte_perplexity ↓ | 3.75 | 2.738 |
+| gsarti/flores_101_tha | tha | byte_perplexity ↓ | 2.104 | 2.035 |
+| gsarti/flores_101_tur | tur | byte_perplexity ↓ | 3.318 | 2.622 |
+| gsarti/flores_101_ukr | ukr | byte_perplexity ↓ | 2.089 | 1.93 |
+| gsarti/flores_101_umb | umb | byte_perplexity ↓ | 11.766 | 11.64 |
+| gsarti/flores_101_urd | urd | byte_perplexity ↓ | 1.779 | 2.982 |
+| gsarti/flores_101_uzb | uzb | byte_perplexity ↓ | 8.5 | 13.209 |
+| gsarti/flores_101_vie | vie | byte_perplexity ↓ | 1.659 | 2.229 |
+| gsarti/flores_101_wol | wol | byte_perplexity ↓ | 6.142 | 13.945 |
+| gsarti/flores_101_xho | xho | byte_perplexity ↓ | 4.69 | 8.42 |
+| gsarti/flores_101_yor | yor | byte_perplexity ↓ | 4.361 | 7.636 |
+| gsarti/flores_101_zho_simpl | zho_simpl | byte_perplexity ↓ | 2.118 | 5.113 |
+| gsarti/flores_101_zho_trad | zho_trad | byte_perplexity ↓ | 2.274 | 5.67 |
+| gsarti/flores_101_zul | zul | byte_perplexity ↓ | 6.017 | 7.341 |
+| headqa | spa | acc ↑ | 0.346 | 0.244 |
+| hellaswag | eng | acc ↑ | 0.535 | 0.592 |
+| lambada_mt_de | deu | acc ↑ | 0.329 | 0.358 |
+| lambada_mt_en | eng | acc ↑ | 0.672 | 0.747 |
+| lambada_mt_es | spa | acc ↑ | 0.476 | 0.397 |
+| lambada_mt_it | ita | acc ↑ | 0.406 | 0.409 |
+| logiqa | eng | acc ↑ | 0.235 | 0.244 |
+| mathqa | eng | acc ↑ | 0.277 | 0.268 |
+| mc_taco | eng | em ↑ | 0.131 | 0.124 |
+| mnli (Median of 15 prompts) | eng | acc ↑ | 0.355 | 0.36 |
+| mnli_mismatched (Median of 15 prompts) | eng | acc ↑ | 0.355 | 0.36 |
+| mrpc | eng | acc ↑ | 0.387 | 0.446 |
+| multirc (Median of 11 prompts) | eng | acc ↑ | 0.571 | 0.599 |
+| openbookqa | eng | acc ↑ | 0.312 | 0.322 |
+| piqa | eng | acc ↑ | 0.781 | 0.791 |
+| prost | eng | acc ↑ | 0.298 | 0.299 |
+| pubmedqa | eng | acc ↑ | 0.741 | 0.709 |
+| qnli | eng | acc ↑ | 0.517 | 0.554 |
+| qqp (Median of 7 prompts) | eng | acc ↑ | 0.588 | 0.395 |
+| race | eng | acc ↑ | 0.39 | 0.402 |
+| rte (Median of 6 prompts) | eng | acc ↑ | 0.52 | 0.495 |
+| sciq | eng | acc ↑ | 0.936 | 0.948 |
+| sst (Median of 6 prompts) | eng | acc ↑ | 0.604 | 0.647 |
+| triviaqa | eng | acc ↑ | 0.183 | 0.342 |
+| tydiqa_primary (Median of 16 prompts) | eng | acc ↑ | 0.281 | 0.148 |
+| webqs | eng | acc ↑ | 0.062 | 0.159 |
+| wic (Median of 11 prompts) | eng | acc ↑ | 0.506 | 0.498 |
+| winogrande | eng | acc ↑ | 0.71 | 0.736 |
+| wnli (Median of 6 prompts) | eng | acc ↑ | 0.57 | 0.563 |
+| wsc (Median of 11 prompts) | eng | acc ↑ | 0.519 | 0.413 |
| humaneval | python | pass@1 ↑ | 0.155 | 0.0 |
-| humaneval | python | pass@10 ↑ | 0.328 | 0.0 |
-| humaneval | python | pass@100 ↑ | 0.572 | 0.003 |
+| humaneval | python | pass@10 ↑ | 0.322 | 0.0 |
+| humaneval | python | pass@100 ↑ | 0.555 | 0.003 |
**Train-time Evaluation:**
@@ -741,11 +2447,9 @@ Initial prompting experiments using interim checkpoints: https://huggingface.co/
The checkpoints in this repo correspond to the HuggingFace Transformers format. If you want to use our fork of [Megatron-DeepSpeed](https://github.com/bigscience-workshop/Megatron-DeepSpeed) that the model was trained with, you'd want to use [this repo instead](https://huggingface.co/bigscience/bloom-optimizer-states).
-Many intermediate checkpoints are available at https://huggingface.co/bigscience/bloom-intermediate/
-
---
# Model Card Authors
-*Ordered roughly chronologically and by amount of time spent on creating this model card.*
+*Ordered roughly chronologically and by amount of time spent.*
Margaret Mitchell, Giada Pistilli, Yacine Jernite, Ezinwanne Ozoani, Marissa Gerchick, Nazneen Rajani, Sasha Luccioni, Irene Solaiman, Maraim Masoud, Somaieh Nikpoor, Carlos Muñoz Ferrandis, Stas Bekman, Christopher Akiki, Danish Contractor, David Lansky, Angelina McMillan-Major, Tristan Thrush, Suzana Ilić, Gérard Dupont, Shayne Longpre, Manan Dey, Stella Biderman, Douwe Kiela, Emi Baylor, Teven Le Scao, Aaron Gokaslan, Julien Launay, Niklas Muennighoff
\ No newline at end of file
diff --git a/config.json b/config.json
index e9d0a5574e859ae07790c378d5c69a5a2aae5c9e..c8079e2826ac7108a7ca2ba54c47ec0cab8b18f8 100644
--- a/config.json
+++ b/config.json
@@ -2,7 +2,7 @@
"apply_residual_connection_post_layernorm": false,
"attention_dropout": 0.0,
"architectures": [
- "BloomForCausalLM"
+ "BloomModel"
],
"attention_softmax_in_fp32": true,
"pad_token_id": 3,
@@ -21,4 +21,4 @@
"transformers_version": "4.21.0",
"use_cache": true,
"vocab_size": 250880
-}
\ No newline at end of file
+}
diff --git a/model.safetensors.index.json b/model.safetensors.index.json
deleted file mode 100644
index 7dd7b9f86c92d65291d95a8bd02b5fe73c545c81..0000000000000000000000000000000000000000
--- a/model.safetensors.index.json
+++ /dev/null
@@ -1,852 +0,0 @@
-{
- "metadata": {
- "total_size": 352494542848
- },
- "weight_map": {
- "h.0.input_layernorm.bias": "model_00002-of-00072.safetensors",
- "h.0.input_layernorm.weight": "model_00002-of-00072.safetensors",
- "h.0.mlp.dense_4h_to_h.bias": "model_00002-of-00072.safetensors",
- "h.0.mlp.dense_4h_to_h.weight": "model_00002-of-00072.safetensors",
- "h.0.mlp.dense_h_to_4h.bias": "model_00002-of-00072.safetensors",
- "h.0.mlp.dense_h_to_4h.weight": "model_00002-of-00072.safetensors",
- "h.0.post_attention_layernorm.bias": "model_00002-of-00072.safetensors",
- "h.0.post_attention_layernorm.weight": "model_00002-of-00072.safetensors",
- "h.0.self_attention.dense.bias": "model_00002-of-00072.safetensors",
- "h.0.self_attention.dense.weight": "model_00002-of-00072.safetensors",
- "h.0.self_attention.query_key_value.bias": "model_00002-of-00072.safetensors",
- "h.0.self_attention.query_key_value.weight": "model_00002-of-00072.safetensors",
- "h.1.input_layernorm.bias": "model_00003-of-00072.safetensors",
- "h.1.input_layernorm.weight": "model_00003-of-00072.safetensors",
- "h.1.mlp.dense_4h_to_h.bias": "model_00003-of-00072.safetensors",
- "h.1.mlp.dense_4h_to_h.weight": "model_00003-of-00072.safetensors",
- "h.1.mlp.dense_h_to_4h.bias": "model_00003-of-00072.safetensors",
- "h.1.mlp.dense_h_to_4h.weight": "model_00003-of-00072.safetensors",
- "h.1.post_attention_layernorm.bias": "model_00003-of-00072.safetensors",
- "h.1.post_attention_layernorm.weight": "model_00003-of-00072.safetensors",
- "h.1.self_attention.dense.bias": "model_00003-of-00072.safetensors",
- "h.1.self_attention.dense.weight": "model_00003-of-00072.safetensors",
- "h.1.self_attention.query_key_value.bias": "model_00003-of-00072.safetensors",
- "h.1.self_attention.query_key_value.weight": "model_00003-of-00072.safetensors",
- "h.10.input_layernorm.bias": "model_00012-of-00072.safetensors",
- "h.10.input_layernorm.weight": "model_00012-of-00072.safetensors",
- "h.10.mlp.dense_4h_to_h.bias": "model_00012-of-00072.safetensors",
- "h.10.mlp.dense_4h_to_h.weight": "model_00012-of-00072.safetensors",
- "h.10.mlp.dense_h_to_4h.bias": "model_00012-of-00072.safetensors",
- "h.10.mlp.dense_h_to_4h.weight": "model_00012-of-00072.safetensors",
- "h.10.post_attention_layernorm.bias": "model_00012-of-00072.safetensors",
- "h.10.post_attention_layernorm.weight": "model_00012-of-00072.safetensors",
- "h.10.self_attention.dense.bias": "model_00012-of-00072.safetensors",
- "h.10.self_attention.dense.weight": "model_00012-of-00072.safetensors",
- "h.10.self_attention.query_key_value.bias": "model_00012-of-00072.safetensors",
- "h.10.self_attention.query_key_value.weight": "model_00012-of-00072.safetensors",
- "h.11.input_layernorm.bias": "model_00013-of-00072.safetensors",
- "h.11.input_layernorm.weight": "model_00013-of-00072.safetensors",
- "h.11.mlp.dense_4h_to_h.bias": "model_00013-of-00072.safetensors",
- "h.11.mlp.dense_4h_to_h.weight": "model_00013-of-00072.safetensors",
- "h.11.mlp.dense_h_to_4h.bias": "model_00013-of-00072.safetensors",
- "h.11.mlp.dense_h_to_4h.weight": "model_00013-of-00072.safetensors",
- "h.11.post_attention_layernorm.bias": "model_00013-of-00072.safetensors",
- "h.11.post_attention_layernorm.weight": "model_00013-of-00072.safetensors",
- "h.11.self_attention.dense.bias": "model_00013-of-00072.safetensors",
- "h.11.self_attention.dense.weight": "model_00013-of-00072.safetensors",
- "h.11.self_attention.query_key_value.bias": "model_00013-of-00072.safetensors",
- "h.11.self_attention.query_key_value.weight": "model_00013-of-00072.safetensors",
- "h.12.input_layernorm.bias": "model_00014-of-00072.safetensors",
- "h.12.input_layernorm.weight": "model_00014-of-00072.safetensors",
- "h.12.mlp.dense_4h_to_h.bias": "model_00014-of-00072.safetensors",
- "h.12.mlp.dense_4h_to_h.weight": "model_00014-of-00072.safetensors",
- "h.12.mlp.dense_h_to_4h.bias": "model_00014-of-00072.safetensors",
- "h.12.mlp.dense_h_to_4h.weight": "model_00014-of-00072.safetensors",
- "h.12.post_attention_layernorm.bias": "model_00014-of-00072.safetensors",
- "h.12.post_attention_layernorm.weight": "model_00014-of-00072.safetensors",
- "h.12.self_attention.dense.bias": "model_00014-of-00072.safetensors",
- "h.12.self_attention.dense.weight": "model_00014-of-00072.safetensors",
- "h.12.self_attention.query_key_value.bias": "model_00014-of-00072.safetensors",
- "h.12.self_attention.query_key_value.weight": "model_00014-of-00072.safetensors",
- "h.13.input_layernorm.bias": "model_00015-of-00072.safetensors",
- "h.13.input_layernorm.weight": "model_00015-of-00072.safetensors",
- "h.13.mlp.dense_4h_to_h.bias": "model_00015-of-00072.safetensors",
- "h.13.mlp.dense_4h_to_h.weight": "model_00015-of-00072.safetensors",
- "h.13.mlp.dense_h_to_4h.bias": "model_00015-of-00072.safetensors",
- "h.13.mlp.dense_h_to_4h.weight": "model_00015-of-00072.safetensors",
- "h.13.post_attention_layernorm.bias": "model_00015-of-00072.safetensors",
- "h.13.post_attention_layernorm.weight": "model_00015-of-00072.safetensors",
- "h.13.self_attention.dense.bias": "model_00015-of-00072.safetensors",
- "h.13.self_attention.dense.weight": "model_00015-of-00072.safetensors",
- "h.13.self_attention.query_key_value.bias": "model_00015-of-00072.safetensors",
- "h.13.self_attention.query_key_value.weight": "model_00015-of-00072.safetensors",
- "h.14.input_layernorm.bias": "model_00016-of-00072.safetensors",
- "h.14.input_layernorm.weight": "model_00016-of-00072.safetensors",
- "h.14.mlp.dense_4h_to_h.bias": "model_00016-of-00072.safetensors",
- "h.14.mlp.dense_4h_to_h.weight": "model_00016-of-00072.safetensors",
- "h.14.mlp.dense_h_to_4h.bias": "model_00016-of-00072.safetensors",
- "h.14.mlp.dense_h_to_4h.weight": "model_00016-of-00072.safetensors",
- "h.14.post_attention_layernorm.bias": "model_00016-of-00072.safetensors",
- "h.14.post_attention_layernorm.weight": "model_00016-of-00072.safetensors",
- "h.14.self_attention.dense.bias": "model_00016-of-00072.safetensors",
- "h.14.self_attention.dense.weight": "model_00016-of-00072.safetensors",
- "h.14.self_attention.query_key_value.bias": "model_00016-of-00072.safetensors",
- "h.14.self_attention.query_key_value.weight": "model_00016-of-00072.safetensors",
- "h.15.input_layernorm.bias": "model_00017-of-00072.safetensors",
- "h.15.input_layernorm.weight": "model_00017-of-00072.safetensors",
- "h.15.mlp.dense_4h_to_h.bias": "model_00017-of-00072.safetensors",
- "h.15.mlp.dense_4h_to_h.weight": "model_00017-of-00072.safetensors",
- "h.15.mlp.dense_h_to_4h.bias": "model_00017-of-00072.safetensors",
- "h.15.mlp.dense_h_to_4h.weight": "model_00017-of-00072.safetensors",
- "h.15.post_attention_layernorm.bias": "model_00017-of-00072.safetensors",
- "h.15.post_attention_layernorm.weight": "model_00017-of-00072.safetensors",
- "h.15.self_attention.dense.bias": "model_00017-of-00072.safetensors",
- "h.15.self_attention.dense.weight": "model_00017-of-00072.safetensors",
- "h.15.self_attention.query_key_value.bias": "model_00017-of-00072.safetensors",
- "h.15.self_attention.query_key_value.weight": "model_00017-of-00072.safetensors",
- "h.16.input_layernorm.bias": "model_00018-of-00072.safetensors",
- "h.16.input_layernorm.weight": "model_00018-of-00072.safetensors",
- "h.16.mlp.dense_4h_to_h.bias": "model_00018-of-00072.safetensors",
- "h.16.mlp.dense_4h_to_h.weight": "model_00018-of-00072.safetensors",
- "h.16.mlp.dense_h_to_4h.bias": "model_00018-of-00072.safetensors",
- "h.16.mlp.dense_h_to_4h.weight": "model_00018-of-00072.safetensors",
- "h.16.post_attention_layernorm.bias": "model_00018-of-00072.safetensors",
- "h.16.post_attention_layernorm.weight": "model_00018-of-00072.safetensors",
- "h.16.self_attention.dense.bias": "model_00018-of-00072.safetensors",
- "h.16.self_attention.dense.weight": "model_00018-of-00072.safetensors",
- "h.16.self_attention.query_key_value.bias": "model_00018-of-00072.safetensors",
- "h.16.self_attention.query_key_value.weight": "model_00018-of-00072.safetensors",
- "h.17.input_layernorm.bias": "model_00019-of-00072.safetensors",
- "h.17.input_layernorm.weight": "model_00019-of-00072.safetensors",
- "h.17.mlp.dense_4h_to_h.bias": "model_00019-of-00072.safetensors",
- "h.17.mlp.dense_4h_to_h.weight": "model_00019-of-00072.safetensors",
- "h.17.mlp.dense_h_to_4h.bias": "model_00019-of-00072.safetensors",
- "h.17.mlp.dense_h_to_4h.weight": "model_00019-of-00072.safetensors",
- "h.17.post_attention_layernorm.bias": "model_00019-of-00072.safetensors",
- "h.17.post_attention_layernorm.weight": "model_00019-of-00072.safetensors",
- "h.17.self_attention.dense.bias": "model_00019-of-00072.safetensors",
- "h.17.self_attention.dense.weight": "model_00019-of-00072.safetensors",
- "h.17.self_attention.query_key_value.bias": "model_00019-of-00072.safetensors",
- "h.17.self_attention.query_key_value.weight": "model_00019-of-00072.safetensors",
- "h.18.input_layernorm.bias": "model_00020-of-00072.safetensors",
- "h.18.input_layernorm.weight": "model_00020-of-00072.safetensors",
- "h.18.mlp.dense_4h_to_h.bias": "model_00020-of-00072.safetensors",
- "h.18.mlp.dense_4h_to_h.weight": "model_00020-of-00072.safetensors",
- "h.18.mlp.dense_h_to_4h.bias": "model_00020-of-00072.safetensors",
- "h.18.mlp.dense_h_to_4h.weight": "model_00020-of-00072.safetensors",
- "h.18.post_attention_layernorm.bias": "model_00020-of-00072.safetensors",
- "h.18.post_attention_layernorm.weight": "model_00020-of-00072.safetensors",
- "h.18.self_attention.dense.bias": "model_00020-of-00072.safetensors",
- "h.18.self_attention.dense.weight": "model_00020-of-00072.safetensors",
- "h.18.self_attention.query_key_value.bias": "model_00020-of-00072.safetensors",
- "h.18.self_attention.query_key_value.weight": "model_00020-of-00072.safetensors",
- "h.19.input_layernorm.bias": "model_00021-of-00072.safetensors",
- "h.19.input_layernorm.weight": "model_00021-of-00072.safetensors",
- "h.19.mlp.dense_4h_to_h.bias": "model_00021-of-00072.safetensors",
- "h.19.mlp.dense_4h_to_h.weight": "model_00021-of-00072.safetensors",
- "h.19.mlp.dense_h_to_4h.bias": "model_00021-of-00072.safetensors",
- "h.19.mlp.dense_h_to_4h.weight": "model_00021-of-00072.safetensors",
- "h.19.post_attention_layernorm.bias": "model_00021-of-00072.safetensors",
- "h.19.post_attention_layernorm.weight": "model_00021-of-00072.safetensors",
- "h.19.self_attention.dense.bias": "model_00021-of-00072.safetensors",
- "h.19.self_attention.dense.weight": "model_00021-of-00072.safetensors",
- "h.19.self_attention.query_key_value.bias": "model_00021-of-00072.safetensors",
- "h.19.self_attention.query_key_value.weight": "model_00021-of-00072.safetensors",
- "h.2.input_layernorm.bias": "model_00004-of-00072.safetensors",
- "h.2.input_layernorm.weight": "model_00004-of-00072.safetensors",
- "h.2.mlp.dense_4h_to_h.bias": "model_00004-of-00072.safetensors",
- "h.2.mlp.dense_4h_to_h.weight": "model_00004-of-00072.safetensors",
- "h.2.mlp.dense_h_to_4h.bias": "model_00004-of-00072.safetensors",
- "h.2.mlp.dense_h_to_4h.weight": "model_00004-of-00072.safetensors",
- "h.2.post_attention_layernorm.bias": "model_00004-of-00072.safetensors",
- "h.2.post_attention_layernorm.weight": "model_00004-of-00072.safetensors",
- "h.2.self_attention.dense.bias": "model_00004-of-00072.safetensors",
- "h.2.self_attention.dense.weight": "model_00004-of-00072.safetensors",
- "h.2.self_attention.query_key_value.bias": "model_00004-of-00072.safetensors",
- "h.2.self_attention.query_key_value.weight": "model_00004-of-00072.safetensors",
- "h.20.input_layernorm.bias": "model_00022-of-00072.safetensors",
- "h.20.input_layernorm.weight": "model_00022-of-00072.safetensors",
- "h.20.mlp.dense_4h_to_h.bias": "model_00022-of-00072.safetensors",
- "h.20.mlp.dense_4h_to_h.weight": "model_00022-of-00072.safetensors",
- "h.20.mlp.dense_h_to_4h.bias": "model_00022-of-00072.safetensors",
- "h.20.mlp.dense_h_to_4h.weight": "model_00022-of-00072.safetensors",
- "h.20.post_attention_layernorm.bias": "model_00022-of-00072.safetensors",
- "h.20.post_attention_layernorm.weight": "model_00022-of-00072.safetensors",
- "h.20.self_attention.dense.bias": "model_00022-of-00072.safetensors",
- "h.20.self_attention.dense.weight": "model_00022-of-00072.safetensors",
- "h.20.self_attention.query_key_value.bias": "model_00022-of-00072.safetensors",
- "h.20.self_attention.query_key_value.weight": "model_00022-of-00072.safetensors",
- "h.21.input_layernorm.bias": "model_00023-of-00072.safetensors",
- "h.21.input_layernorm.weight": "model_00023-of-00072.safetensors",
- "h.21.mlp.dense_4h_to_h.bias": "model_00023-of-00072.safetensors",
- "h.21.mlp.dense_4h_to_h.weight": "model_00023-of-00072.safetensors",
- "h.21.mlp.dense_h_to_4h.bias": "model_00023-of-00072.safetensors",
- "h.21.mlp.dense_h_to_4h.weight": "model_00023-of-00072.safetensors",
- "h.21.post_attention_layernorm.bias": "model_00023-of-00072.safetensors",
- "h.21.post_attention_layernorm.weight": "model_00023-of-00072.safetensors",
- "h.21.self_attention.dense.bias": "model_00023-of-00072.safetensors",
- "h.21.self_attention.dense.weight": "model_00023-of-00072.safetensors",
- "h.21.self_attention.query_key_value.bias": "model_00023-of-00072.safetensors",
- "h.21.self_attention.query_key_value.weight": "model_00023-of-00072.safetensors",
- "h.22.input_layernorm.bias": "model_00024-of-00072.safetensors",
- "h.22.input_layernorm.weight": "model_00024-of-00072.safetensors",
- "h.22.mlp.dense_4h_to_h.bias": "model_00024-of-00072.safetensors",
- "h.22.mlp.dense_4h_to_h.weight": "model_00024-of-00072.safetensors",
- "h.22.mlp.dense_h_to_4h.bias": "model_00024-of-00072.safetensors",
- "h.22.mlp.dense_h_to_4h.weight": "model_00024-of-00072.safetensors",
- "h.22.post_attention_layernorm.bias": "model_00024-of-00072.safetensors",
- "h.22.post_attention_layernorm.weight": "model_00024-of-00072.safetensors",
- "h.22.self_attention.dense.bias": "model_00024-of-00072.safetensors",
- "h.22.self_attention.dense.weight": "model_00024-of-00072.safetensors",
- "h.22.self_attention.query_key_value.bias": "model_00024-of-00072.safetensors",
- "h.22.self_attention.query_key_value.weight": "model_00024-of-00072.safetensors",
- "h.23.input_layernorm.bias": "model_00025-of-00072.safetensors",
- "h.23.input_layernorm.weight": "model_00025-of-00072.safetensors",
- "h.23.mlp.dense_4h_to_h.bias": "model_00025-of-00072.safetensors",
- "h.23.mlp.dense_4h_to_h.weight": "model_00025-of-00072.safetensors",
- "h.23.mlp.dense_h_to_4h.bias": "model_00025-of-00072.safetensors",
- "h.23.mlp.dense_h_to_4h.weight": "model_00025-of-00072.safetensors",
- "h.23.post_attention_layernorm.bias": "model_00025-of-00072.safetensors",
- "h.23.post_attention_layernorm.weight": "model_00025-of-00072.safetensors",
- "h.23.self_attention.dense.bias": "model_00025-of-00072.safetensors",
- "h.23.self_attention.dense.weight": "model_00025-of-00072.safetensors",
- "h.23.self_attention.query_key_value.bias": "model_00025-of-00072.safetensors",
- "h.23.self_attention.query_key_value.weight": "model_00025-of-00072.safetensors",
- "h.24.input_layernorm.bias": "model_00026-of-00072.safetensors",
- "h.24.input_layernorm.weight": "model_00026-of-00072.safetensors",
- "h.24.mlp.dense_4h_to_h.bias": "model_00026-of-00072.safetensors",
- "h.24.mlp.dense_4h_to_h.weight": "model_00026-of-00072.safetensors",
- "h.24.mlp.dense_h_to_4h.bias": "model_00026-of-00072.safetensors",
- "h.24.mlp.dense_h_to_4h.weight": "model_00026-of-00072.safetensors",
- "h.24.post_attention_layernorm.bias": "model_00026-of-00072.safetensors",
- "h.24.post_attention_layernorm.weight": "model_00026-of-00072.safetensors",
- "h.24.self_attention.dense.bias": "model_00026-of-00072.safetensors",
- "h.24.self_attention.dense.weight": "model_00026-of-00072.safetensors",
- "h.24.self_attention.query_key_value.bias": "model_00026-of-00072.safetensors",
- "h.24.self_attention.query_key_value.weight": "model_00026-of-00072.safetensors",
- "h.25.input_layernorm.bias": "model_00027-of-00072.safetensors",
- "h.25.input_layernorm.weight": "model_00027-of-00072.safetensors",
- "h.25.mlp.dense_4h_to_h.bias": "model_00027-of-00072.safetensors",
- "h.25.mlp.dense_4h_to_h.weight": "model_00027-of-00072.safetensors",
- "h.25.mlp.dense_h_to_4h.bias": "model_00027-of-00072.safetensors",
- "h.25.mlp.dense_h_to_4h.weight": "model_00027-of-00072.safetensors",
- "h.25.post_attention_layernorm.bias": "model_00027-of-00072.safetensors",
- "h.25.post_attention_layernorm.weight": "model_00027-of-00072.safetensors",
- "h.25.self_attention.dense.bias": "model_00027-of-00072.safetensors",
- "h.25.self_attention.dense.weight": "model_00027-of-00072.safetensors",
- "h.25.self_attention.query_key_value.bias": "model_00027-of-00072.safetensors",
- "h.25.self_attention.query_key_value.weight": "model_00027-of-00072.safetensors",
- "h.26.input_layernorm.bias": "model_00028-of-00072.safetensors",
- "h.26.input_layernorm.weight": "model_00028-of-00072.safetensors",
- "h.26.mlp.dense_4h_to_h.bias": "model_00028-of-00072.safetensors",
- "h.26.mlp.dense_4h_to_h.weight": "model_00028-of-00072.safetensors",
- "h.26.mlp.dense_h_to_4h.bias": "model_00028-of-00072.safetensors",
- "h.26.mlp.dense_h_to_4h.weight": "model_00028-of-00072.safetensors",
- "h.26.post_attention_layernorm.bias": "model_00028-of-00072.safetensors",
- "h.26.post_attention_layernorm.weight": "model_00028-of-00072.safetensors",
- "h.26.self_attention.dense.bias": "model_00028-of-00072.safetensors",
- "h.26.self_attention.dense.weight": "model_00028-of-00072.safetensors",
- "h.26.self_attention.query_key_value.bias": "model_00028-of-00072.safetensors",
- "h.26.self_attention.query_key_value.weight": "model_00028-of-00072.safetensors",
- "h.27.input_layernorm.bias": "model_00029-of-00072.safetensors",
- "h.27.input_layernorm.weight": "model_00029-of-00072.safetensors",
- "h.27.mlp.dense_4h_to_h.bias": "model_00029-of-00072.safetensors",
- "h.27.mlp.dense_4h_to_h.weight": "model_00029-of-00072.safetensors",
- "h.27.mlp.dense_h_to_4h.bias": "model_00029-of-00072.safetensors",
- "h.27.mlp.dense_h_to_4h.weight": "model_00029-of-00072.safetensors",
- "h.27.post_attention_layernorm.bias": "model_00029-of-00072.safetensors",
- "h.27.post_attention_layernorm.weight": "model_00029-of-00072.safetensors",
- "h.27.self_attention.dense.bias": "model_00029-of-00072.safetensors",
- "h.27.self_attention.dense.weight": "model_00029-of-00072.safetensors",
- "h.27.self_attention.query_key_value.bias": "model_00029-of-00072.safetensors",
- "h.27.self_attention.query_key_value.weight": "model_00029-of-00072.safetensors",
- "h.28.input_layernorm.bias": "model_00030-of-00072.safetensors",
- "h.28.input_layernorm.weight": "model_00030-of-00072.safetensors",
- "h.28.mlp.dense_4h_to_h.bias": "model_00030-of-00072.safetensors",
- "h.28.mlp.dense_4h_to_h.weight": "model_00030-of-00072.safetensors",
- "h.28.mlp.dense_h_to_4h.bias": "model_00030-of-00072.safetensors",
- "h.28.mlp.dense_h_to_4h.weight": "model_00030-of-00072.safetensors",
- "h.28.post_attention_layernorm.bias": "model_00030-of-00072.safetensors",
- "h.28.post_attention_layernorm.weight": "model_00030-of-00072.safetensors",
- "h.28.self_attention.dense.bias": "model_00030-of-00072.safetensors",
- "h.28.self_attention.dense.weight": "model_00030-of-00072.safetensors",
- "h.28.self_attention.query_key_value.bias": "model_00030-of-00072.safetensors",
- "h.28.self_attention.query_key_value.weight": "model_00030-of-00072.safetensors",
- "h.29.input_layernorm.bias": "model_00031-of-00072.safetensors",
- "h.29.input_layernorm.weight": "model_00031-of-00072.safetensors",
- "h.29.mlp.dense_4h_to_h.bias": "model_00031-of-00072.safetensors",
- "h.29.mlp.dense_4h_to_h.weight": "model_00031-of-00072.safetensors",
- "h.29.mlp.dense_h_to_4h.bias": "model_00031-of-00072.safetensors",
- "h.29.mlp.dense_h_to_4h.weight": "model_00031-of-00072.safetensors",
- "h.29.post_attention_layernorm.bias": "model_00031-of-00072.safetensors",
- "h.29.post_attention_layernorm.weight": "model_00031-of-00072.safetensors",
- "h.29.self_attention.dense.bias": "model_00031-of-00072.safetensors",
- "h.29.self_attention.dense.weight": "model_00031-of-00072.safetensors",
- "h.29.self_attention.query_key_value.bias": "model_00031-of-00072.safetensors",
- "h.29.self_attention.query_key_value.weight": "model_00031-of-00072.safetensors",
- "h.3.input_layernorm.bias": "model_00005-of-00072.safetensors",
- "h.3.input_layernorm.weight": "model_00005-of-00072.safetensors",
- "h.3.mlp.dense_4h_to_h.bias": "model_00005-of-00072.safetensors",
- "h.3.mlp.dense_4h_to_h.weight": "model_00005-of-00072.safetensors",
- "h.3.mlp.dense_h_to_4h.bias": "model_00005-of-00072.safetensors",
- "h.3.mlp.dense_h_to_4h.weight": "model_00005-of-00072.safetensors",
- "h.3.post_attention_layernorm.bias": "model_00005-of-00072.safetensors",
- "h.3.post_attention_layernorm.weight": "model_00005-of-00072.safetensors",
- "h.3.self_attention.dense.bias": "model_00005-of-00072.safetensors",
- "h.3.self_attention.dense.weight": "model_00005-of-00072.safetensors",
- "h.3.self_attention.query_key_value.bias": "model_00005-of-00072.safetensors",
- "h.3.self_attention.query_key_value.weight": "model_00005-of-00072.safetensors",
- "h.30.input_layernorm.bias": "model_00032-of-00072.safetensors",
- "h.30.input_layernorm.weight": "model_00032-of-00072.safetensors",
- "h.30.mlp.dense_4h_to_h.bias": "model_00032-of-00072.safetensors",
- "h.30.mlp.dense_4h_to_h.weight": "model_00032-of-00072.safetensors",
- "h.30.mlp.dense_h_to_4h.bias": "model_00032-of-00072.safetensors",
- "h.30.mlp.dense_h_to_4h.weight": "model_00032-of-00072.safetensors",
- "h.30.post_attention_layernorm.bias": "model_00032-of-00072.safetensors",
- "h.30.post_attention_layernorm.weight": "model_00032-of-00072.safetensors",
- "h.30.self_attention.dense.bias": "model_00032-of-00072.safetensors",
- "h.30.self_attention.dense.weight": "model_00032-of-00072.safetensors",
- "h.30.self_attention.query_key_value.bias": "model_00032-of-00072.safetensors",
- "h.30.self_attention.query_key_value.weight": "model_00032-of-00072.safetensors",
- "h.31.input_layernorm.bias": "model_00033-of-00072.safetensors",
- "h.31.input_layernorm.weight": "model_00033-of-00072.safetensors",
- "h.31.mlp.dense_4h_to_h.bias": "model_00033-of-00072.safetensors",
- "h.31.mlp.dense_4h_to_h.weight": "model_00033-of-00072.safetensors",
- "h.31.mlp.dense_h_to_4h.bias": "model_00033-of-00072.safetensors",
- "h.31.mlp.dense_h_to_4h.weight": "model_00033-of-00072.safetensors",
- "h.31.post_attention_layernorm.bias": "model_00033-of-00072.safetensors",
- "h.31.post_attention_layernorm.weight": "model_00033-of-00072.safetensors",
- "h.31.self_attention.dense.bias": "model_00033-of-00072.safetensors",
- "h.31.self_attention.dense.weight": "model_00033-of-00072.safetensors",
- "h.31.self_attention.query_key_value.bias": "model_00033-of-00072.safetensors",
- "h.31.self_attention.query_key_value.weight": "model_00033-of-00072.safetensors",
- "h.32.input_layernorm.bias": "model_00034-of-00072.safetensors",
- "h.32.input_layernorm.weight": "model_00034-of-00072.safetensors",
- "h.32.mlp.dense_4h_to_h.bias": "model_00034-of-00072.safetensors",
- "h.32.mlp.dense_4h_to_h.weight": "model_00034-of-00072.safetensors",
- "h.32.mlp.dense_h_to_4h.bias": "model_00034-of-00072.safetensors",
- "h.32.mlp.dense_h_to_4h.weight": "model_00034-of-00072.safetensors",
- "h.32.post_attention_layernorm.bias": "model_00034-of-00072.safetensors",
- "h.32.post_attention_layernorm.weight": "model_00034-of-00072.safetensors",
- "h.32.self_attention.dense.bias": "model_00034-of-00072.safetensors",
- "h.32.self_attention.dense.weight": "model_00034-of-00072.safetensors",
- "h.32.self_attention.query_key_value.bias": "model_00034-of-00072.safetensors",
- "h.32.self_attention.query_key_value.weight": "model_00034-of-00072.safetensors",
- "h.33.input_layernorm.bias": "model_00035-of-00072.safetensors",
- "h.33.input_layernorm.weight": "model_00035-of-00072.safetensors",
- "h.33.mlp.dense_4h_to_h.bias": "model_00035-of-00072.safetensors",
- "h.33.mlp.dense_4h_to_h.weight": "model_00035-of-00072.safetensors",
- "h.33.mlp.dense_h_to_4h.bias": "model_00035-of-00072.safetensors",
- "h.33.mlp.dense_h_to_4h.weight": "model_00035-of-00072.safetensors",
- "h.33.post_attention_layernorm.bias": "model_00035-of-00072.safetensors",
- "h.33.post_attention_layernorm.weight": "model_00035-of-00072.safetensors",
- "h.33.self_attention.dense.bias": "model_00035-of-00072.safetensors",
- "h.33.self_attention.dense.weight": "model_00035-of-00072.safetensors",
- "h.33.self_attention.query_key_value.bias": "model_00035-of-00072.safetensors",
- "h.33.self_attention.query_key_value.weight": "model_00035-of-00072.safetensors",
- "h.34.input_layernorm.bias": "model_00036-of-00072.safetensors",
- "h.34.input_layernorm.weight": "model_00036-of-00072.safetensors",
- "h.34.mlp.dense_4h_to_h.bias": "model_00036-of-00072.safetensors",
- "h.34.mlp.dense_4h_to_h.weight": "model_00036-of-00072.safetensors",
- "h.34.mlp.dense_h_to_4h.bias": "model_00036-of-00072.safetensors",
- "h.34.mlp.dense_h_to_4h.weight": "model_00036-of-00072.safetensors",
- "h.34.post_attention_layernorm.bias": "model_00036-of-00072.safetensors",
- "h.34.post_attention_layernorm.weight": "model_00036-of-00072.safetensors",
- "h.34.self_attention.dense.bias": "model_00036-of-00072.safetensors",
- "h.34.self_attention.dense.weight": "model_00036-of-00072.safetensors",
- "h.34.self_attention.query_key_value.bias": "model_00036-of-00072.safetensors",
- "h.34.self_attention.query_key_value.weight": "model_00036-of-00072.safetensors",
- "h.35.input_layernorm.bias": "model_00037-of-00072.safetensors",
- "h.35.input_layernorm.weight": "model_00037-of-00072.safetensors",
- "h.35.mlp.dense_4h_to_h.bias": "model_00037-of-00072.safetensors",
- "h.35.mlp.dense_4h_to_h.weight": "model_00037-of-00072.safetensors",
- "h.35.mlp.dense_h_to_4h.bias": "model_00037-of-00072.safetensors",
- "h.35.mlp.dense_h_to_4h.weight": "model_00037-of-00072.safetensors",
- "h.35.post_attention_layernorm.bias": "model_00037-of-00072.safetensors",
- "h.35.post_attention_layernorm.weight": "model_00037-of-00072.safetensors",
- "h.35.self_attention.dense.bias": "model_00037-of-00072.safetensors",
- "h.35.self_attention.dense.weight": "model_00037-of-00072.safetensors",
- "h.35.self_attention.query_key_value.bias": "model_00037-of-00072.safetensors",
- "h.35.self_attention.query_key_value.weight": "model_00037-of-00072.safetensors",
- "h.36.input_layernorm.bias": "model_00038-of-00072.safetensors",
- "h.36.input_layernorm.weight": "model_00038-of-00072.safetensors",
- "h.36.mlp.dense_4h_to_h.bias": "model_00038-of-00072.safetensors",
- "h.36.mlp.dense_4h_to_h.weight": "model_00038-of-00072.safetensors",
- "h.36.mlp.dense_h_to_4h.bias": "model_00038-of-00072.safetensors",
- "h.36.mlp.dense_h_to_4h.weight": "model_00038-of-00072.safetensors",
- "h.36.post_attention_layernorm.bias": "model_00038-of-00072.safetensors",
- "h.36.post_attention_layernorm.weight": "model_00038-of-00072.safetensors",
- "h.36.self_attention.dense.bias": "model_00038-of-00072.safetensors",
- "h.36.self_attention.dense.weight": "model_00038-of-00072.safetensors",
- "h.36.self_attention.query_key_value.bias": "model_00038-of-00072.safetensors",
- "h.36.self_attention.query_key_value.weight": "model_00038-of-00072.safetensors",
- "h.37.input_layernorm.bias": "model_00039-of-00072.safetensors",
- "h.37.input_layernorm.weight": "model_00039-of-00072.safetensors",
- "h.37.mlp.dense_4h_to_h.bias": "model_00039-of-00072.safetensors",
- "h.37.mlp.dense_4h_to_h.weight": "model_00039-of-00072.safetensors",
- "h.37.mlp.dense_h_to_4h.bias": "model_00039-of-00072.safetensors",
- "h.37.mlp.dense_h_to_4h.weight": "model_00039-of-00072.safetensors",
- "h.37.post_attention_layernorm.bias": "model_00039-of-00072.safetensors",
- "h.37.post_attention_layernorm.weight": "model_00039-of-00072.safetensors",
- "h.37.self_attention.dense.bias": "model_00039-of-00072.safetensors",
- "h.37.self_attention.dense.weight": "model_00039-of-00072.safetensors",
- "h.37.self_attention.query_key_value.bias": "model_00039-of-00072.safetensors",
- "h.37.self_attention.query_key_value.weight": "model_00039-of-00072.safetensors",
- "h.38.input_layernorm.bias": "model_00040-of-00072.safetensors",
- "h.38.input_layernorm.weight": "model_00040-of-00072.safetensors",
- "h.38.mlp.dense_4h_to_h.bias": "model_00040-of-00072.safetensors",
- "h.38.mlp.dense_4h_to_h.weight": "model_00040-of-00072.safetensors",
- "h.38.mlp.dense_h_to_4h.bias": "model_00040-of-00072.safetensors",
- "h.38.mlp.dense_h_to_4h.weight": "model_00040-of-00072.safetensors",
- "h.38.post_attention_layernorm.bias": "model_00040-of-00072.safetensors",
- "h.38.post_attention_layernorm.weight": "model_00040-of-00072.safetensors",
- "h.38.self_attention.dense.bias": "model_00040-of-00072.safetensors",
- "h.38.self_attention.dense.weight": "model_00040-of-00072.safetensors",
- "h.38.self_attention.query_key_value.bias": "model_00040-of-00072.safetensors",
- "h.38.self_attention.query_key_value.weight": "model_00040-of-00072.safetensors",
- "h.39.input_layernorm.bias": "model_00041-of-00072.safetensors",
- "h.39.input_layernorm.weight": "model_00041-of-00072.safetensors",
- "h.39.mlp.dense_4h_to_h.bias": "model_00041-of-00072.safetensors",
- "h.39.mlp.dense_4h_to_h.weight": "model_00041-of-00072.safetensors",
- "h.39.mlp.dense_h_to_4h.bias": "model_00041-of-00072.safetensors",
- "h.39.mlp.dense_h_to_4h.weight": "model_00041-of-00072.safetensors",
- "h.39.post_attention_layernorm.bias": "model_00041-of-00072.safetensors",
- "h.39.post_attention_layernorm.weight": "model_00041-of-00072.safetensors",
- "h.39.self_attention.dense.bias": "model_00041-of-00072.safetensors",
- "h.39.self_attention.dense.weight": "model_00041-of-00072.safetensors",
- "h.39.self_attention.query_key_value.bias": "model_00041-of-00072.safetensors",
- "h.39.self_attention.query_key_value.weight": "model_00041-of-00072.safetensors",
- "h.4.input_layernorm.bias": "model_00006-of-00072.safetensors",
- "h.4.input_layernorm.weight": "model_00006-of-00072.safetensors",
- "h.4.mlp.dense_4h_to_h.bias": "model_00006-of-00072.safetensors",
- "h.4.mlp.dense_4h_to_h.weight": "model_00006-of-00072.safetensors",
- "h.4.mlp.dense_h_to_4h.bias": "model_00006-of-00072.safetensors",
- "h.4.mlp.dense_h_to_4h.weight": "model_00006-of-00072.safetensors",
- "h.4.post_attention_layernorm.bias": "model_00006-of-00072.safetensors",
- "h.4.post_attention_layernorm.weight": "model_00006-of-00072.safetensors",
- "h.4.self_attention.dense.bias": "model_00006-of-00072.safetensors",
- "h.4.self_attention.dense.weight": "model_00006-of-00072.safetensors",
- "h.4.self_attention.query_key_value.bias": "model_00006-of-00072.safetensors",
- "h.4.self_attention.query_key_value.weight": "model_00006-of-00072.safetensors",
- "h.40.input_layernorm.bias": "model_00042-of-00072.safetensors",
- "h.40.input_layernorm.weight": "model_00042-of-00072.safetensors",
- "h.40.mlp.dense_4h_to_h.bias": "model_00042-of-00072.safetensors",
- "h.40.mlp.dense_4h_to_h.weight": "model_00042-of-00072.safetensors",
- "h.40.mlp.dense_h_to_4h.bias": "model_00042-of-00072.safetensors",
- "h.40.mlp.dense_h_to_4h.weight": "model_00042-of-00072.safetensors",
- "h.40.post_attention_layernorm.bias": "model_00042-of-00072.safetensors",
- "h.40.post_attention_layernorm.weight": "model_00042-of-00072.safetensors",
- "h.40.self_attention.dense.bias": "model_00042-of-00072.safetensors",
- "h.40.self_attention.dense.weight": "model_00042-of-00072.safetensors",
- "h.40.self_attention.query_key_value.bias": "model_00042-of-00072.safetensors",
- "h.40.self_attention.query_key_value.weight": "model_00042-of-00072.safetensors",
- "h.41.input_layernorm.bias": "model_00043-of-00072.safetensors",
- "h.41.input_layernorm.weight": "model_00043-of-00072.safetensors",
- "h.41.mlp.dense_4h_to_h.bias": "model_00043-of-00072.safetensors",
- "h.41.mlp.dense_4h_to_h.weight": "model_00043-of-00072.safetensors",
- "h.41.mlp.dense_h_to_4h.bias": "model_00043-of-00072.safetensors",
- "h.41.mlp.dense_h_to_4h.weight": "model_00043-of-00072.safetensors",
- "h.41.post_attention_layernorm.bias": "model_00043-of-00072.safetensors",
- "h.41.post_attention_layernorm.weight": "model_00043-of-00072.safetensors",
- "h.41.self_attention.dense.bias": "model_00043-of-00072.safetensors",
- "h.41.self_attention.dense.weight": "model_00043-of-00072.safetensors",
- "h.41.self_attention.query_key_value.bias": "model_00043-of-00072.safetensors",
- "h.41.self_attention.query_key_value.weight": "model_00043-of-00072.safetensors",
- "h.42.input_layernorm.bias": "model_00044-of-00072.safetensors",
- "h.42.input_layernorm.weight": "model_00044-of-00072.safetensors",
- "h.42.mlp.dense_4h_to_h.bias": "model_00044-of-00072.safetensors",
- "h.42.mlp.dense_4h_to_h.weight": "model_00044-of-00072.safetensors",
- "h.42.mlp.dense_h_to_4h.bias": "model_00044-of-00072.safetensors",
- "h.42.mlp.dense_h_to_4h.weight": "model_00044-of-00072.safetensors",
- "h.42.post_attention_layernorm.bias": "model_00044-of-00072.safetensors",
- "h.42.post_attention_layernorm.weight": "model_00044-of-00072.safetensors",
- "h.42.self_attention.dense.bias": "model_00044-of-00072.safetensors",
- "h.42.self_attention.dense.weight": "model_00044-of-00072.safetensors",
- "h.42.self_attention.query_key_value.bias": "model_00044-of-00072.safetensors",
- "h.42.self_attention.query_key_value.weight": "model_00044-of-00072.safetensors",
- "h.43.input_layernorm.bias": "model_00045-of-00072.safetensors",
- "h.43.input_layernorm.weight": "model_00045-of-00072.safetensors",
- "h.43.mlp.dense_4h_to_h.bias": "model_00045-of-00072.safetensors",
- "h.43.mlp.dense_4h_to_h.weight": "model_00045-of-00072.safetensors",
- "h.43.mlp.dense_h_to_4h.bias": "model_00045-of-00072.safetensors",
- "h.43.mlp.dense_h_to_4h.weight": "model_00045-of-00072.safetensors",
- "h.43.post_attention_layernorm.bias": "model_00045-of-00072.safetensors",
- "h.43.post_attention_layernorm.weight": "model_00045-of-00072.safetensors",
- "h.43.self_attention.dense.bias": "model_00045-of-00072.safetensors",
- "h.43.self_attention.dense.weight": "model_00045-of-00072.safetensors",
- "h.43.self_attention.query_key_value.bias": "model_00045-of-00072.safetensors",
- "h.43.self_attention.query_key_value.weight": "model_00045-of-00072.safetensors",
- "h.44.input_layernorm.bias": "model_00046-of-00072.safetensors",
- "h.44.input_layernorm.weight": "model_00046-of-00072.safetensors",
- "h.44.mlp.dense_4h_to_h.bias": "model_00046-of-00072.safetensors",
- "h.44.mlp.dense_4h_to_h.weight": "model_00046-of-00072.safetensors",
- "h.44.mlp.dense_h_to_4h.bias": "model_00046-of-00072.safetensors",
- "h.44.mlp.dense_h_to_4h.weight": "model_00046-of-00072.safetensors",
- "h.44.post_attention_layernorm.bias": "model_00046-of-00072.safetensors",
- "h.44.post_attention_layernorm.weight": "model_00046-of-00072.safetensors",
- "h.44.self_attention.dense.bias": "model_00046-of-00072.safetensors",
- "h.44.self_attention.dense.weight": "model_00046-of-00072.safetensors",
- "h.44.self_attention.query_key_value.bias": "model_00046-of-00072.safetensors",
- "h.44.self_attention.query_key_value.weight": "model_00046-of-00072.safetensors",
- "h.45.input_layernorm.bias": "model_00047-of-00072.safetensors",
- "h.45.input_layernorm.weight": "model_00047-of-00072.safetensors",
- "h.45.mlp.dense_4h_to_h.bias": "model_00047-of-00072.safetensors",
- "h.45.mlp.dense_4h_to_h.weight": "model_00047-of-00072.safetensors",
- "h.45.mlp.dense_h_to_4h.bias": "model_00047-of-00072.safetensors",
- "h.45.mlp.dense_h_to_4h.weight": "model_00047-of-00072.safetensors",
- "h.45.post_attention_layernorm.bias": "model_00047-of-00072.safetensors",
- "h.45.post_attention_layernorm.weight": "model_00047-of-00072.safetensors",
- "h.45.self_attention.dense.bias": "model_00047-of-00072.safetensors",
- "h.45.self_attention.dense.weight": "model_00047-of-00072.safetensors",
- "h.45.self_attention.query_key_value.bias": "model_00047-of-00072.safetensors",
- "h.45.self_attention.query_key_value.weight": "model_00047-of-00072.safetensors",
- "h.46.input_layernorm.bias": "model_00048-of-00072.safetensors",
- "h.46.input_layernorm.weight": "model_00048-of-00072.safetensors",
- "h.46.mlp.dense_4h_to_h.bias": "model_00048-of-00072.safetensors",
- "h.46.mlp.dense_4h_to_h.weight": "model_00048-of-00072.safetensors",
- "h.46.mlp.dense_h_to_4h.bias": "model_00048-of-00072.safetensors",
- "h.46.mlp.dense_h_to_4h.weight": "model_00048-of-00072.safetensors",
- "h.46.post_attention_layernorm.bias": "model_00048-of-00072.safetensors",
- "h.46.post_attention_layernorm.weight": "model_00048-of-00072.safetensors",
- "h.46.self_attention.dense.bias": "model_00048-of-00072.safetensors",
- "h.46.self_attention.dense.weight": "model_00048-of-00072.safetensors",
- "h.46.self_attention.query_key_value.bias": "model_00048-of-00072.safetensors",
- "h.46.self_attention.query_key_value.weight": "model_00048-of-00072.safetensors",
- "h.47.input_layernorm.bias": "model_00049-of-00072.safetensors",
- "h.47.input_layernorm.weight": "model_00049-of-00072.safetensors",
- "h.47.mlp.dense_4h_to_h.bias": "model_00049-of-00072.safetensors",
- "h.47.mlp.dense_4h_to_h.weight": "model_00049-of-00072.safetensors",
- "h.47.mlp.dense_h_to_4h.bias": "model_00049-of-00072.safetensors",
- "h.47.mlp.dense_h_to_4h.weight": "model_00049-of-00072.safetensors",
- "h.47.post_attention_layernorm.bias": "model_00049-of-00072.safetensors",
- "h.47.post_attention_layernorm.weight": "model_00049-of-00072.safetensors",
- "h.47.self_attention.dense.bias": "model_00049-of-00072.safetensors",
- "h.47.self_attention.dense.weight": "model_00049-of-00072.safetensors",
- "h.47.self_attention.query_key_value.bias": "model_00049-of-00072.safetensors",
- "h.47.self_attention.query_key_value.weight": "model_00049-of-00072.safetensors",
- "h.48.input_layernorm.bias": "model_00050-of-00072.safetensors",
- "h.48.input_layernorm.weight": "model_00050-of-00072.safetensors",
- "h.48.mlp.dense_4h_to_h.bias": "model_00050-of-00072.safetensors",
- "h.48.mlp.dense_4h_to_h.weight": "model_00050-of-00072.safetensors",
- "h.48.mlp.dense_h_to_4h.bias": "model_00050-of-00072.safetensors",
- "h.48.mlp.dense_h_to_4h.weight": "model_00050-of-00072.safetensors",
- "h.48.post_attention_layernorm.bias": "model_00050-of-00072.safetensors",
- "h.48.post_attention_layernorm.weight": "model_00050-of-00072.safetensors",
- "h.48.self_attention.dense.bias": "model_00050-of-00072.safetensors",
- "h.48.self_attention.dense.weight": "model_00050-of-00072.safetensors",
- "h.48.self_attention.query_key_value.bias": "model_00050-of-00072.safetensors",
- "h.48.self_attention.query_key_value.weight": "model_00050-of-00072.safetensors",
- "h.49.input_layernorm.bias": "model_00051-of-00072.safetensors",
- "h.49.input_layernorm.weight": "model_00051-of-00072.safetensors",
- "h.49.mlp.dense_4h_to_h.bias": "model_00051-of-00072.safetensors",
- "h.49.mlp.dense_4h_to_h.weight": "model_00051-of-00072.safetensors",
- "h.49.mlp.dense_h_to_4h.bias": "model_00051-of-00072.safetensors",
- "h.49.mlp.dense_h_to_4h.weight": "model_00051-of-00072.safetensors",
- "h.49.post_attention_layernorm.bias": "model_00051-of-00072.safetensors",
- "h.49.post_attention_layernorm.weight": "model_00051-of-00072.safetensors",
- "h.49.self_attention.dense.bias": "model_00051-of-00072.safetensors",
- "h.49.self_attention.dense.weight": "model_00051-of-00072.safetensors",
- "h.49.self_attention.query_key_value.bias": "model_00051-of-00072.safetensors",
- "h.49.self_attention.query_key_value.weight": "model_00051-of-00072.safetensors",
- "h.5.input_layernorm.bias": "model_00007-of-00072.safetensors",
- "h.5.input_layernorm.weight": "model_00007-of-00072.safetensors",
- "h.5.mlp.dense_4h_to_h.bias": "model_00007-of-00072.safetensors",
- "h.5.mlp.dense_4h_to_h.weight": "model_00007-of-00072.safetensors",
- "h.5.mlp.dense_h_to_4h.bias": "model_00007-of-00072.safetensors",
- "h.5.mlp.dense_h_to_4h.weight": "model_00007-of-00072.safetensors",
- "h.5.post_attention_layernorm.bias": "model_00007-of-00072.safetensors",
- "h.5.post_attention_layernorm.weight": "model_00007-of-00072.safetensors",
- "h.5.self_attention.dense.bias": "model_00007-of-00072.safetensors",
- "h.5.self_attention.dense.weight": "model_00007-of-00072.safetensors",
- "h.5.self_attention.query_key_value.bias": "model_00007-of-00072.safetensors",
- "h.5.self_attention.query_key_value.weight": "model_00007-of-00072.safetensors",
- "h.50.input_layernorm.bias": "model_00052-of-00072.safetensors",
- "h.50.input_layernorm.weight": "model_00052-of-00072.safetensors",
- "h.50.mlp.dense_4h_to_h.bias": "model_00052-of-00072.safetensors",
- "h.50.mlp.dense_4h_to_h.weight": "model_00052-of-00072.safetensors",
- "h.50.mlp.dense_h_to_4h.bias": "model_00052-of-00072.safetensors",
- "h.50.mlp.dense_h_to_4h.weight": "model_00052-of-00072.safetensors",
- "h.50.post_attention_layernorm.bias": "model_00052-of-00072.safetensors",
- "h.50.post_attention_layernorm.weight": "model_00052-of-00072.safetensors",
- "h.50.self_attention.dense.bias": "model_00052-of-00072.safetensors",
- "h.50.self_attention.dense.weight": "model_00052-of-00072.safetensors",
- "h.50.self_attention.query_key_value.bias": "model_00052-of-00072.safetensors",
- "h.50.self_attention.query_key_value.weight": "model_00052-of-00072.safetensors",
- "h.51.input_layernorm.bias": "model_00053-of-00072.safetensors",
- "h.51.input_layernorm.weight": "model_00053-of-00072.safetensors",
- "h.51.mlp.dense_4h_to_h.bias": "model_00053-of-00072.safetensors",
- "h.51.mlp.dense_4h_to_h.weight": "model_00053-of-00072.safetensors",
- "h.51.mlp.dense_h_to_4h.bias": "model_00053-of-00072.safetensors",
- "h.51.mlp.dense_h_to_4h.weight": "model_00053-of-00072.safetensors",
- "h.51.post_attention_layernorm.bias": "model_00053-of-00072.safetensors",
- "h.51.post_attention_layernorm.weight": "model_00053-of-00072.safetensors",
- "h.51.self_attention.dense.bias": "model_00053-of-00072.safetensors",
- "h.51.self_attention.dense.weight": "model_00053-of-00072.safetensors",
- "h.51.self_attention.query_key_value.bias": "model_00053-of-00072.safetensors",
- "h.51.self_attention.query_key_value.weight": "model_00053-of-00072.safetensors",
- "h.52.input_layernorm.bias": "model_00054-of-00072.safetensors",
- "h.52.input_layernorm.weight": "model_00054-of-00072.safetensors",
- "h.52.mlp.dense_4h_to_h.bias": "model_00054-of-00072.safetensors",
- "h.52.mlp.dense_4h_to_h.weight": "model_00054-of-00072.safetensors",
- "h.52.mlp.dense_h_to_4h.bias": "model_00054-of-00072.safetensors",
- "h.52.mlp.dense_h_to_4h.weight": "model_00054-of-00072.safetensors",
- "h.52.post_attention_layernorm.bias": "model_00054-of-00072.safetensors",
- "h.52.post_attention_layernorm.weight": "model_00054-of-00072.safetensors",
- "h.52.self_attention.dense.bias": "model_00054-of-00072.safetensors",
- "h.52.self_attention.dense.weight": "model_00054-of-00072.safetensors",
- "h.52.self_attention.query_key_value.bias": "model_00054-of-00072.safetensors",
- "h.52.self_attention.query_key_value.weight": "model_00054-of-00072.safetensors",
- "h.53.input_layernorm.bias": "model_00055-of-00072.safetensors",
- "h.53.input_layernorm.weight": "model_00055-of-00072.safetensors",
- "h.53.mlp.dense_4h_to_h.bias": "model_00055-of-00072.safetensors",
- "h.53.mlp.dense_4h_to_h.weight": "model_00055-of-00072.safetensors",
- "h.53.mlp.dense_h_to_4h.bias": "model_00055-of-00072.safetensors",
- "h.53.mlp.dense_h_to_4h.weight": "model_00055-of-00072.safetensors",
- "h.53.post_attention_layernorm.bias": "model_00055-of-00072.safetensors",
- "h.53.post_attention_layernorm.weight": "model_00055-of-00072.safetensors",
- "h.53.self_attention.dense.bias": "model_00055-of-00072.safetensors",
- "h.53.self_attention.dense.weight": "model_00055-of-00072.safetensors",
- "h.53.self_attention.query_key_value.bias": "model_00055-of-00072.safetensors",
- "h.53.self_attention.query_key_value.weight": "model_00055-of-00072.safetensors",
- "h.54.input_layernorm.bias": "model_00056-of-00072.safetensors",
- "h.54.input_layernorm.weight": "model_00056-of-00072.safetensors",
- "h.54.mlp.dense_4h_to_h.bias": "model_00056-of-00072.safetensors",
- "h.54.mlp.dense_4h_to_h.weight": "model_00056-of-00072.safetensors",
- "h.54.mlp.dense_h_to_4h.bias": "model_00056-of-00072.safetensors",
- "h.54.mlp.dense_h_to_4h.weight": "model_00056-of-00072.safetensors",
- "h.54.post_attention_layernorm.bias": "model_00056-of-00072.safetensors",
- "h.54.post_attention_layernorm.weight": "model_00056-of-00072.safetensors",
- "h.54.self_attention.dense.bias": "model_00056-of-00072.safetensors",
- "h.54.self_attention.dense.weight": "model_00056-of-00072.safetensors",
- "h.54.self_attention.query_key_value.bias": "model_00056-of-00072.safetensors",
- "h.54.self_attention.query_key_value.weight": "model_00056-of-00072.safetensors",
- "h.55.input_layernorm.bias": "model_00057-of-00072.safetensors",
- "h.55.input_layernorm.weight": "model_00057-of-00072.safetensors",
- "h.55.mlp.dense_4h_to_h.bias": "model_00057-of-00072.safetensors",
- "h.55.mlp.dense_4h_to_h.weight": "model_00057-of-00072.safetensors",
- "h.55.mlp.dense_h_to_4h.bias": "model_00057-of-00072.safetensors",
- "h.55.mlp.dense_h_to_4h.weight": "model_00057-of-00072.safetensors",
- "h.55.post_attention_layernorm.bias": "model_00057-of-00072.safetensors",
- "h.55.post_attention_layernorm.weight": "model_00057-of-00072.safetensors",
- "h.55.self_attention.dense.bias": "model_00057-of-00072.safetensors",
- "h.55.self_attention.dense.weight": "model_00057-of-00072.safetensors",
- "h.55.self_attention.query_key_value.bias": "model_00057-of-00072.safetensors",
- "h.55.self_attention.query_key_value.weight": "model_00057-of-00072.safetensors",
- "h.56.input_layernorm.bias": "model_00058-of-00072.safetensors",
- "h.56.input_layernorm.weight": "model_00058-of-00072.safetensors",
- "h.56.mlp.dense_4h_to_h.bias": "model_00058-of-00072.safetensors",
- "h.56.mlp.dense_4h_to_h.weight": "model_00058-of-00072.safetensors",
- "h.56.mlp.dense_h_to_4h.bias": "model_00058-of-00072.safetensors",
- "h.56.mlp.dense_h_to_4h.weight": "model_00058-of-00072.safetensors",
- "h.56.post_attention_layernorm.bias": "model_00058-of-00072.safetensors",
- "h.56.post_attention_layernorm.weight": "model_00058-of-00072.safetensors",
- "h.56.self_attention.dense.bias": "model_00058-of-00072.safetensors",
- "h.56.self_attention.dense.weight": "model_00058-of-00072.safetensors",
- "h.56.self_attention.query_key_value.bias": "model_00058-of-00072.safetensors",
- "h.56.self_attention.query_key_value.weight": "model_00058-of-00072.safetensors",
- "h.57.input_layernorm.bias": "model_00059-of-00072.safetensors",
- "h.57.input_layernorm.weight": "model_00059-of-00072.safetensors",
- "h.57.mlp.dense_4h_to_h.bias": "model_00059-of-00072.safetensors",
- "h.57.mlp.dense_4h_to_h.weight": "model_00059-of-00072.safetensors",
- "h.57.mlp.dense_h_to_4h.bias": "model_00059-of-00072.safetensors",
- "h.57.mlp.dense_h_to_4h.weight": "model_00059-of-00072.safetensors",
- "h.57.post_attention_layernorm.bias": "model_00059-of-00072.safetensors",
- "h.57.post_attention_layernorm.weight": "model_00059-of-00072.safetensors",
- "h.57.self_attention.dense.bias": "model_00059-of-00072.safetensors",
- "h.57.self_attention.dense.weight": "model_00059-of-00072.safetensors",
- "h.57.self_attention.query_key_value.bias": "model_00059-of-00072.safetensors",
- "h.57.self_attention.query_key_value.weight": "model_00059-of-00072.safetensors",
- "h.58.input_layernorm.bias": "model_00060-of-00072.safetensors",
- "h.58.input_layernorm.weight": "model_00060-of-00072.safetensors",
- "h.58.mlp.dense_4h_to_h.bias": "model_00060-of-00072.safetensors",
- "h.58.mlp.dense_4h_to_h.weight": "model_00060-of-00072.safetensors",
- "h.58.mlp.dense_h_to_4h.bias": "model_00060-of-00072.safetensors",
- "h.58.mlp.dense_h_to_4h.weight": "model_00060-of-00072.safetensors",
- "h.58.post_attention_layernorm.bias": "model_00060-of-00072.safetensors",
- "h.58.post_attention_layernorm.weight": "model_00060-of-00072.safetensors",
- "h.58.self_attention.dense.bias": "model_00060-of-00072.safetensors",
- "h.58.self_attention.dense.weight": "model_00060-of-00072.safetensors",
- "h.58.self_attention.query_key_value.bias": "model_00060-of-00072.safetensors",
- "h.58.self_attention.query_key_value.weight": "model_00060-of-00072.safetensors",
- "h.59.input_layernorm.bias": "model_00061-of-00072.safetensors",
- "h.59.input_layernorm.weight": "model_00061-of-00072.safetensors",
- "h.59.mlp.dense_4h_to_h.bias": "model_00061-of-00072.safetensors",
- "h.59.mlp.dense_4h_to_h.weight": "model_00061-of-00072.safetensors",
- "h.59.mlp.dense_h_to_4h.bias": "model_00061-of-00072.safetensors",
- "h.59.mlp.dense_h_to_4h.weight": "model_00061-of-00072.safetensors",
- "h.59.post_attention_layernorm.bias": "model_00061-of-00072.safetensors",
- "h.59.post_attention_layernorm.weight": "model_00061-of-00072.safetensors",
- "h.59.self_attention.dense.bias": "model_00061-of-00072.safetensors",
- "h.59.self_attention.dense.weight": "model_00061-of-00072.safetensors",
- "h.59.self_attention.query_key_value.bias": "model_00061-of-00072.safetensors",
- "h.59.self_attention.query_key_value.weight": "model_00061-of-00072.safetensors",
- "h.6.input_layernorm.bias": "model_00008-of-00072.safetensors",
- "h.6.input_layernorm.weight": "model_00008-of-00072.safetensors",
- "h.6.mlp.dense_4h_to_h.bias": "model_00008-of-00072.safetensors",
- "h.6.mlp.dense_4h_to_h.weight": "model_00008-of-00072.safetensors",
- "h.6.mlp.dense_h_to_4h.bias": "model_00008-of-00072.safetensors",
- "h.6.mlp.dense_h_to_4h.weight": "model_00008-of-00072.safetensors",
- "h.6.post_attention_layernorm.bias": "model_00008-of-00072.safetensors",
- "h.6.post_attention_layernorm.weight": "model_00008-of-00072.safetensors",
- "h.6.self_attention.dense.bias": "model_00008-of-00072.safetensors",
- "h.6.self_attention.dense.weight": "model_00008-of-00072.safetensors",
- "h.6.self_attention.query_key_value.bias": "model_00008-of-00072.safetensors",
- "h.6.self_attention.query_key_value.weight": "model_00008-of-00072.safetensors",
- "h.60.input_layernorm.bias": "model_00062-of-00072.safetensors",
- "h.60.input_layernorm.weight": "model_00062-of-00072.safetensors",
- "h.60.mlp.dense_4h_to_h.bias": "model_00062-of-00072.safetensors",
- "h.60.mlp.dense_4h_to_h.weight": "model_00062-of-00072.safetensors",
- "h.60.mlp.dense_h_to_4h.bias": "model_00062-of-00072.safetensors",
- "h.60.mlp.dense_h_to_4h.weight": "model_00062-of-00072.safetensors",
- "h.60.post_attention_layernorm.bias": "model_00062-of-00072.safetensors",
- "h.60.post_attention_layernorm.weight": "model_00062-of-00072.safetensors",
- "h.60.self_attention.dense.bias": "model_00062-of-00072.safetensors",
- "h.60.self_attention.dense.weight": "model_00062-of-00072.safetensors",
- "h.60.self_attention.query_key_value.bias": "model_00062-of-00072.safetensors",
- "h.60.self_attention.query_key_value.weight": "model_00062-of-00072.safetensors",
- "h.61.input_layernorm.bias": "model_00063-of-00072.safetensors",
- "h.61.input_layernorm.weight": "model_00063-of-00072.safetensors",
- "h.61.mlp.dense_4h_to_h.bias": "model_00063-of-00072.safetensors",
- "h.61.mlp.dense_4h_to_h.weight": "model_00063-of-00072.safetensors",
- "h.61.mlp.dense_h_to_4h.bias": "model_00063-of-00072.safetensors",
- "h.61.mlp.dense_h_to_4h.weight": "model_00063-of-00072.safetensors",
- "h.61.post_attention_layernorm.bias": "model_00063-of-00072.safetensors",
- "h.61.post_attention_layernorm.weight": "model_00063-of-00072.safetensors",
- "h.61.self_attention.dense.bias": "model_00063-of-00072.safetensors",
- "h.61.self_attention.dense.weight": "model_00063-of-00072.safetensors",
- "h.61.self_attention.query_key_value.bias": "model_00063-of-00072.safetensors",
- "h.61.self_attention.query_key_value.weight": "model_00063-of-00072.safetensors",
- "h.62.input_layernorm.bias": "model_00064-of-00072.safetensors",
- "h.62.input_layernorm.weight": "model_00064-of-00072.safetensors",
- "h.62.mlp.dense_4h_to_h.bias": "model_00064-of-00072.safetensors",
- "h.62.mlp.dense_4h_to_h.weight": "model_00064-of-00072.safetensors",
- "h.62.mlp.dense_h_to_4h.bias": "model_00064-of-00072.safetensors",
- "h.62.mlp.dense_h_to_4h.weight": "model_00064-of-00072.safetensors",
- "h.62.post_attention_layernorm.bias": "model_00064-of-00072.safetensors",
- "h.62.post_attention_layernorm.weight": "model_00064-of-00072.safetensors",
- "h.62.self_attention.dense.bias": "model_00064-of-00072.safetensors",
- "h.62.self_attention.dense.weight": "model_00064-of-00072.safetensors",
- "h.62.self_attention.query_key_value.bias": "model_00064-of-00072.safetensors",
- "h.62.self_attention.query_key_value.weight": "model_00064-of-00072.safetensors",
- "h.63.input_layernorm.bias": "model_00065-of-00072.safetensors",
- "h.63.input_layernorm.weight": "model_00065-of-00072.safetensors",
- "h.63.mlp.dense_4h_to_h.bias": "model_00065-of-00072.safetensors",
- "h.63.mlp.dense_4h_to_h.weight": "model_00065-of-00072.safetensors",
- "h.63.mlp.dense_h_to_4h.bias": "model_00065-of-00072.safetensors",
- "h.63.mlp.dense_h_to_4h.weight": "model_00065-of-00072.safetensors",
- "h.63.post_attention_layernorm.bias": "model_00065-of-00072.safetensors",
- "h.63.post_attention_layernorm.weight": "model_00065-of-00072.safetensors",
- "h.63.self_attention.dense.bias": "model_00065-of-00072.safetensors",
- "h.63.self_attention.dense.weight": "model_00065-of-00072.safetensors",
- "h.63.self_attention.query_key_value.bias": "model_00065-of-00072.safetensors",
- "h.63.self_attention.query_key_value.weight": "model_00065-of-00072.safetensors",
- "h.64.input_layernorm.bias": "model_00066-of-00072.safetensors",
- "h.64.input_layernorm.weight": "model_00066-of-00072.safetensors",
- "h.64.mlp.dense_4h_to_h.bias": "model_00066-of-00072.safetensors",
- "h.64.mlp.dense_4h_to_h.weight": "model_00066-of-00072.safetensors",
- "h.64.mlp.dense_h_to_4h.bias": "model_00066-of-00072.safetensors",
- "h.64.mlp.dense_h_to_4h.weight": "model_00066-of-00072.safetensors",
- "h.64.post_attention_layernorm.bias": "model_00066-of-00072.safetensors",
- "h.64.post_attention_layernorm.weight": "model_00066-of-00072.safetensors",
- "h.64.self_attention.dense.bias": "model_00066-of-00072.safetensors",
- "h.64.self_attention.dense.weight": "model_00066-of-00072.safetensors",
- "h.64.self_attention.query_key_value.bias": "model_00066-of-00072.safetensors",
- "h.64.self_attention.query_key_value.weight": "model_00066-of-00072.safetensors",
- "h.65.input_layernorm.bias": "model_00067-of-00072.safetensors",
- "h.65.input_layernorm.weight": "model_00067-of-00072.safetensors",
- "h.65.mlp.dense_4h_to_h.bias": "model_00067-of-00072.safetensors",
- "h.65.mlp.dense_4h_to_h.weight": "model_00067-of-00072.safetensors",
- "h.65.mlp.dense_h_to_4h.bias": "model_00067-of-00072.safetensors",
- "h.65.mlp.dense_h_to_4h.weight": "model_00067-of-00072.safetensors",
- "h.65.post_attention_layernorm.bias": "model_00067-of-00072.safetensors",
- "h.65.post_attention_layernorm.weight": "model_00067-of-00072.safetensors",
- "h.65.self_attention.dense.bias": "model_00067-of-00072.safetensors",
- "h.65.self_attention.dense.weight": "model_00067-of-00072.safetensors",
- "h.65.self_attention.query_key_value.bias": "model_00067-of-00072.safetensors",
- "h.65.self_attention.query_key_value.weight": "model_00067-of-00072.safetensors",
- "h.66.input_layernorm.bias": "model_00068-of-00072.safetensors",
- "h.66.input_layernorm.weight": "model_00068-of-00072.safetensors",
- "h.66.mlp.dense_4h_to_h.bias": "model_00068-of-00072.safetensors",
- "h.66.mlp.dense_4h_to_h.weight": "model_00068-of-00072.safetensors",
- "h.66.mlp.dense_h_to_4h.bias": "model_00068-of-00072.safetensors",
- "h.66.mlp.dense_h_to_4h.weight": "model_00068-of-00072.safetensors",
- "h.66.post_attention_layernorm.bias": "model_00068-of-00072.safetensors",
- "h.66.post_attention_layernorm.weight": "model_00068-of-00072.safetensors",
- "h.66.self_attention.dense.bias": "model_00068-of-00072.safetensors",
- "h.66.self_attention.dense.weight": "model_00068-of-00072.safetensors",
- "h.66.self_attention.query_key_value.bias": "model_00068-of-00072.safetensors",
- "h.66.self_attention.query_key_value.weight": "model_00068-of-00072.safetensors",
- "h.67.input_layernorm.bias": "model_00069-of-00072.safetensors",
- "h.67.input_layernorm.weight": "model_00069-of-00072.safetensors",
- "h.67.mlp.dense_4h_to_h.bias": "model_00069-of-00072.safetensors",
- "h.67.mlp.dense_4h_to_h.weight": "model_00069-of-00072.safetensors",
- "h.67.mlp.dense_h_to_4h.bias": "model_00069-of-00072.safetensors",
- "h.67.mlp.dense_h_to_4h.weight": "model_00069-of-00072.safetensors",
- "h.67.post_attention_layernorm.bias": "model_00069-of-00072.safetensors",
- "h.67.post_attention_layernorm.weight": "model_00069-of-00072.safetensors",
- "h.67.self_attention.dense.bias": "model_00069-of-00072.safetensors",
- "h.67.self_attention.dense.weight": "model_00069-of-00072.safetensors",
- "h.67.self_attention.query_key_value.bias": "model_00069-of-00072.safetensors",
- "h.67.self_attention.query_key_value.weight": "model_00069-of-00072.safetensors",
- "h.68.input_layernorm.bias": "model_00070-of-00072.safetensors",
- "h.68.input_layernorm.weight": "model_00070-of-00072.safetensors",
- "h.68.mlp.dense_4h_to_h.bias": "model_00070-of-00072.safetensors",
- "h.68.mlp.dense_4h_to_h.weight": "model_00070-of-00072.safetensors",
- "h.68.mlp.dense_h_to_4h.bias": "model_00070-of-00072.safetensors",
- "h.68.mlp.dense_h_to_4h.weight": "model_00070-of-00072.safetensors",
- "h.68.post_attention_layernorm.bias": "model_00070-of-00072.safetensors",
- "h.68.post_attention_layernorm.weight": "model_00070-of-00072.safetensors",
- "h.68.self_attention.dense.bias": "model_00070-of-00072.safetensors",
- "h.68.self_attention.dense.weight": "model_00070-of-00072.safetensors",
- "h.68.self_attention.query_key_value.bias": "model_00070-of-00072.safetensors",
- "h.68.self_attention.query_key_value.weight": "model_00070-of-00072.safetensors",
- "h.69.input_layernorm.bias": "model_00071-of-00072.safetensors",
- "h.69.input_layernorm.weight": "model_00071-of-00072.safetensors",
- "h.69.mlp.dense_4h_to_h.bias": "model_00071-of-00072.safetensors",
- "h.69.mlp.dense_4h_to_h.weight": "model_00071-of-00072.safetensors",
- "h.69.mlp.dense_h_to_4h.bias": "model_00071-of-00072.safetensors",
- "h.69.mlp.dense_h_to_4h.weight": "model_00071-of-00072.safetensors",
- "h.69.post_attention_layernorm.bias": "model_00071-of-00072.safetensors",
- "h.69.post_attention_layernorm.weight": "model_00071-of-00072.safetensors",
- "h.69.self_attention.dense.bias": "model_00071-of-00072.safetensors",
- "h.69.self_attention.dense.weight": "model_00071-of-00072.safetensors",
- "h.69.self_attention.query_key_value.bias": "model_00071-of-00072.safetensors",
- "h.69.self_attention.query_key_value.weight": "model_00071-of-00072.safetensors",
- "h.7.input_layernorm.bias": "model_00009-of-00072.safetensors",
- "h.7.input_layernorm.weight": "model_00009-of-00072.safetensors",
- "h.7.mlp.dense_4h_to_h.bias": "model_00009-of-00072.safetensors",
- "h.7.mlp.dense_4h_to_h.weight": "model_00009-of-00072.safetensors",
- "h.7.mlp.dense_h_to_4h.bias": "model_00009-of-00072.safetensors",
- "h.7.mlp.dense_h_to_4h.weight": "model_00009-of-00072.safetensors",
- "h.7.post_attention_layernorm.bias": "model_00009-of-00072.safetensors",
- "h.7.post_attention_layernorm.weight": "model_00009-of-00072.safetensors",
- "h.7.self_attention.dense.bias": "model_00009-of-00072.safetensors",
- "h.7.self_attention.dense.weight": "model_00009-of-00072.safetensors",
- "h.7.self_attention.query_key_value.bias": "model_00009-of-00072.safetensors",
- "h.7.self_attention.query_key_value.weight": "model_00009-of-00072.safetensors",
- "h.8.input_layernorm.bias": "model_00010-of-00072.safetensors",
- "h.8.input_layernorm.weight": "model_00010-of-00072.safetensors",
- "h.8.mlp.dense_4h_to_h.bias": "model_00010-of-00072.safetensors",
- "h.8.mlp.dense_4h_to_h.weight": "model_00010-of-00072.safetensors",
- "h.8.mlp.dense_h_to_4h.bias": "model_00010-of-00072.safetensors",
- "h.8.mlp.dense_h_to_4h.weight": "model_00010-of-00072.safetensors",
- "h.8.post_attention_layernorm.bias": "model_00010-of-00072.safetensors",
- "h.8.post_attention_layernorm.weight": "model_00010-of-00072.safetensors",
- "h.8.self_attention.dense.bias": "model_00010-of-00072.safetensors",
- "h.8.self_attention.dense.weight": "model_00010-of-00072.safetensors",
- "h.8.self_attention.query_key_value.bias": "model_00010-of-00072.safetensors",
- "h.8.self_attention.query_key_value.weight": "model_00010-of-00072.safetensors",
- "h.9.input_layernorm.bias": "model_00011-of-00072.safetensors",
- "h.9.input_layernorm.weight": "model_00011-of-00072.safetensors",
- "h.9.mlp.dense_4h_to_h.bias": "model_00011-of-00072.safetensors",
- "h.9.mlp.dense_4h_to_h.weight": "model_00011-of-00072.safetensors",
- "h.9.mlp.dense_h_to_4h.bias": "model_00011-of-00072.safetensors",
- "h.9.mlp.dense_h_to_4h.weight": "model_00011-of-00072.safetensors",
- "h.9.post_attention_layernorm.bias": "model_00011-of-00072.safetensors",
- "h.9.post_attention_layernorm.weight": "model_00011-of-00072.safetensors",
- "h.9.self_attention.dense.bias": "model_00011-of-00072.safetensors",
- "h.9.self_attention.dense.weight": "model_00011-of-00072.safetensors",
- "h.9.self_attention.query_key_value.bias": "model_00011-of-00072.safetensors",
- "h.9.self_attention.query_key_value.weight": "model_00011-of-00072.safetensors",
- "ln_f.bias": "model_00072-of-00072.safetensors",
- "ln_f.weight": "model_00072-of-00072.safetensors",
- "word_embeddings.weight": "model_00001-of-00072.safetensors",
- "word_embeddings_layernorm.bias": "model_00001-of-00072.safetensors",
- "word_embeddings_layernorm.weight": "model_00001-of-00072.safetensors"
- }
-}
diff --git a/model_00001-of-00072.safetensors b/model_00001-of-00072.safetensors
deleted file mode 100644
index aed2346478cce088e440b475228d5ebb5e40863f..0000000000000000000000000000000000000000
--- a/model_00001-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:069a00c03b0397f9befb533e6b0f68072cb550d69050868d16d76450c8000357
-size 7193289031
diff --git a/model_00002-of-00072.safetensors b/model_00002-of-00072.safetensors
deleted file mode 100644
index 98a5376fea468322162ec82601ef188aa71832f7..0000000000000000000000000000000000000000
--- a/model_00002-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a59f21045cc81d2e8f72ba46ce16f0688f5f6c58e8344ebfba16ef9f09a94eb0
-size 4932875549
diff --git a/model_00003-of-00072.safetensors b/model_00003-of-00072.safetensors
deleted file mode 100644
index 4399f4adadfe882c75bd82ddbe0ce3be233f5276..0000000000000000000000000000000000000000
--- a/model_00003-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:204922e4d94bb9d9f30c5cb4fc3b35ee5fd2325c7f9840783de220d2d42766ae
-size 4932875551
diff --git a/model_00004-of-00072.safetensors b/model_00004-of-00072.safetensors
deleted file mode 100644
index 82e3d9eec347eed402bdc2b17967325c23576950..0000000000000000000000000000000000000000
--- a/model_00004-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ce3db7dcd528ec2266b9062a1082c545eaa7a6dd06631e85c962c1afd59e24ab
-size 4932875557
diff --git a/model_00005-of-00072.safetensors b/model_00005-of-00072.safetensors
deleted file mode 100644
index 9df3c3c2c684c8c081ba392faac202ff3507a9c6..0000000000000000000000000000000000000000
--- a/model_00005-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:53d8499b65f093087058f490a1d7c19104d846bda9f6625362f149102fdc1d1e
-size 4932875509
diff --git a/model_00006-of-00072.safetensors b/model_00006-of-00072.safetensors
deleted file mode 100644
index 2a0e8a9c37793d1c963c28f9967db39151131948..0000000000000000000000000000000000000000
--- a/model_00006-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:07454a1127747cf58503bd9a3e7671b3adf8d0816d289e08d7e6662f4c2e1824
-size 4932875553
diff --git a/model_00007-of-00072.safetensors b/model_00007-of-00072.safetensors
deleted file mode 100644
index fd9812ddc62dca71e3015a65e240c6ae37ab121c..0000000000000000000000000000000000000000
--- a/model_00007-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a4c991a16871f33406e8c98bf4e7da09447e4d067d3f5056d4fe16daa7a72ab9
-size 4932875551
diff --git a/model_00008-of-00072.safetensors b/model_00008-of-00072.safetensors
deleted file mode 100644
index a56e8abfc86219226cb437f0515a3e1fef00a949..0000000000000000000000000000000000000000
--- a/model_00008-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1ce5257fad9f05ed3399d1736bc0a5c097ad34b2c58e4a97d09a479194932fd2
-size 4932875519
diff --git a/model_00009-of-00072.safetensors b/model_00009-of-00072.safetensors
deleted file mode 100644
index 98fe6b57d56ff3cacb105b7912fd8c3471d5f5c1..0000000000000000000000000000000000000000
--- a/model_00009-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:003aa8c59f07ec60f4599dbd10106e01fed184454d39046e467db47c0a879409
-size 4932875551
diff --git a/model_00010-of-00072.safetensors b/model_00010-of-00072.safetensors
deleted file mode 100644
index 318fbffd648bc7ff9c1a00c38fa3ebad08e34417..0000000000000000000000000000000000000000
--- a/model_00010-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:095828fe07d6c44aa40065595da7800277b1d4079f2a966e841b314c8f3f0d9b
-size 4932875541
diff --git a/model_00011-of-00072.safetensors b/model_00011-of-00072.safetensors
deleted file mode 100644
index ab5e51d7883c292bc6d66e6a207f1b367dbbb97a..0000000000000000000000000000000000000000
--- a/model_00011-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:400a61941848f8a7960684671cd19d232ed9325336c419669012e2aadc9d9051
-size 4932875551
diff --git a/model_00012-of-00072.safetensors b/model_00012-of-00072.safetensors
deleted file mode 100644
index c351caa8250b0b1459ba46a666709cdb61170944..0000000000000000000000000000000000000000
--- a/model_00012-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0f1392fce5a77cbfe29a2d680d82cf57b96e20466ac5efc7119eccd868006443
-size 4932875573
diff --git a/model_00013-of-00072.safetensors b/model_00013-of-00072.safetensors
deleted file mode 100644
index af273759953b30559a591b3af25b778fc6893717..0000000000000000000000000000000000000000
--- a/model_00013-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2c95c6eeb6aa593cdf3fb72239b021ce43714b13d60a89c064944e91afb4999d
-size 4932875573
diff --git a/model_00014-of-00072.safetensors b/model_00014-of-00072.safetensors
deleted file mode 100644
index 3257d4ad42009aebf545b866b84e7809077e48fc..0000000000000000000000000000000000000000
--- a/model_00014-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:982a3db07d4e7f49955519ac5ef9846424e72994514e24b7501d718ffaa732cc
-size 4932875551
diff --git a/model_00015-of-00072.safetensors b/model_00015-of-00072.safetensors
deleted file mode 100644
index 963ee034ed6699381ab6ace6cddb8da0a5906252..0000000000000000000000000000000000000000
--- a/model_00015-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9c27838630d5bd45f9cff790ba2b4f998669be7c6afe94af7cb2b5eca8aab50f
-size 4932875531
diff --git a/model_00016-of-00072.safetensors b/model_00016-of-00072.safetensors
deleted file mode 100644
index 20da2b6b5dee4af991b5f1a39f612f5c50871ec7..0000000000000000000000000000000000000000
--- a/model_00016-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:47ae50301b94f38138ba06c9a1ae435e33893231464fc65937f1e702f96f1031
-size 4932875573
diff --git a/model_00017-of-00072.safetensors b/model_00017-of-00072.safetensors
deleted file mode 100644
index 89a6713bbbc4fa8f9d6b5c96a7c63be74d931d77..0000000000000000000000000000000000000000
--- a/model_00017-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:41e389e47d117b2e95afd21a9f46f2cf25bf7652d34627e005d3168bdac62c13
-size 4932875563
diff --git a/model_00018-of-00072.safetensors b/model_00018-of-00072.safetensors
deleted file mode 100644
index 591e4e5d58f095031fed31f1bb955b96ca4e3d9b..0000000000000000000000000000000000000000
--- a/model_00018-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8ff513612c05e9cacbf264d28d4e1ae789a7c0ee754b7eb06fc22fc87cc3a14d
-size 4932875573
diff --git a/model_00019-of-00072.safetensors b/model_00019-of-00072.safetensors
deleted file mode 100644
index 889ad353eee2bfb2936dfd791f208ce750414318..0000000000000000000000000000000000000000
--- a/model_00019-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bec845aa268edd4eed7f6a158676bdf55a3b3df7a3b7a0a5ef31bb1004279ee6
-size 4932875555
diff --git a/model_00020-of-00072.safetensors b/model_00020-of-00072.safetensors
deleted file mode 100644
index 318e3462d1a27f25c9ee14f7ec9c1f5c4704fb12..0000000000000000000000000000000000000000
--- a/model_00020-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9411792e3db49c214c32fe006b2798a3040c53cf96ab81b0da1d8fcb0dddf50d
-size 4932875563
diff --git a/model_00021-of-00072.safetensors b/model_00021-of-00072.safetensors
deleted file mode 100644
index eb117827cf46b7561382e48dbf7c1593d336cb18..0000000000000000000000000000000000000000
--- a/model_00021-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c48f8df4f00f199d4cd0364b2447988d9b72a5a4a4d22cb0cafddad841c42d8b
-size 4932875553
diff --git a/model_00022-of-00072.safetensors b/model_00022-of-00072.safetensors
deleted file mode 100644
index 7934ed9edc1dbe79200eda58a1049a771bf4f42c..0000000000000000000000000000000000000000
--- a/model_00022-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:67d8af807a9d47357b39882ecdf1cd2c1c04fec49b73ee6a72ba29caf3cc4321
-size 4932875573
diff --git a/model_00023-of-00072.safetensors b/model_00023-of-00072.safetensors
deleted file mode 100644
index a5da02fd7dc4a7e0168d396ce95f917bba2066cd..0000000000000000000000000000000000000000
--- a/model_00023-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1e75d47b9b5683110da78e9497b8a634e4480ee9ccf4a27cd4911416fec16afc
-size 4932875573
diff --git a/model_00024-of-00072.safetensors b/model_00024-of-00072.safetensors
deleted file mode 100644
index 88773ef50a4e0c37442c628d4b8d56ca60f67a04..0000000000000000000000000000000000000000
--- a/model_00024-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8c63189061ad724cc29735f11bc2ab6c86385e2f8916be3019e17a79388e9c45
-size 4932875561
diff --git a/model_00025-of-00072.safetensors b/model_00025-of-00072.safetensors
deleted file mode 100644
index 45ab10e6d16e065a242b2223637930db35428175..0000000000000000000000000000000000000000
--- a/model_00025-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:aaa686ebecf614dccc73b953160ccbc29b1e63fde53821aa1408bde478e1707b
-size 4932875553
diff --git a/model_00026-of-00072.safetensors b/model_00026-of-00072.safetensors
deleted file mode 100644
index 40e0560cc41e03d17de3797c6efc428dae47951f..0000000000000000000000000000000000000000
--- a/model_00026-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:df99b77709af12a4cf78d140b0a354d89a07d01e8b4e1e5a2d51cc44c34c221d
-size 4932875563
diff --git a/model_00027-of-00072.safetensors b/model_00027-of-00072.safetensors
deleted file mode 100644
index 8365f6a43d61ed08604af18625d2705b59f103fc..0000000000000000000000000000000000000000
--- a/model_00027-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e25903c2f7fe2a310840a30466c36affc521f0d755e747b737bc4ae6b99dbed2
-size 4932875563
diff --git a/model_00028-of-00072.safetensors b/model_00028-of-00072.safetensors
deleted file mode 100644
index b34d6727824363b6d5f5ba43d67b722e10fd18b1..0000000000000000000000000000000000000000
--- a/model_00028-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7ef5dd81d468f2ca297e5f31230ec02c7810f65529dc5c51ad897ce7d734cba9
-size 4932875573
diff --git a/model_00029-of-00072.safetensors b/model_00029-of-00072.safetensors
deleted file mode 100644
index bf5116747fb812a6f9a37e623415a9d562f69356..0000000000000000000000000000000000000000
--- a/model_00029-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e79599b306d14e2a81eaa3478fd0aa92d493df310b03f494a6be44fe523e4c05
-size 4932875563
diff --git a/model_00030-of-00072.safetensors b/model_00030-of-00072.safetensors
deleted file mode 100644
index a8f724167d784688438de3e21255a1d76e5cd400..0000000000000000000000000000000000000000
--- a/model_00030-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:88d6c77f313ec50f7bc229a8979cdef707ef870ee547b5b99a6af61ac3de2710
-size 4932875531
diff --git a/model_00031-of-00072.safetensors b/model_00031-of-00072.safetensors
deleted file mode 100644
index 81e86c4b12e84a0a426d2525d6e6549c0071ddfe..0000000000000000000000000000000000000000
--- a/model_00031-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:88e905d0328f4f1e5c93c0ca2c3e5dbf513b2a4509395dc56b16aac400be1050
-size 4932875555
diff --git a/model_00032-of-00072.safetensors b/model_00032-of-00072.safetensors
deleted file mode 100644
index 0f8881d962b52b466fdd6a3bff9b23d9a8b59171..0000000000000000000000000000000000000000
--- a/model_00032-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:173da9d59ec2731fd9110e571aa1d1926ff4a3501d34a4511f006caed253d719
-size 4932875573
diff --git a/model_00033-of-00072.safetensors b/model_00033-of-00072.safetensors
deleted file mode 100644
index 8d0b77157adf4ef1b1412d72706d37d498755724..0000000000000000000000000000000000000000
--- a/model_00033-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:066778f8acabce366e4f5259223af642639a87aaf9cb6aa8b6272944a96184f5
-size 4932875573
diff --git a/model_00034-of-00072.safetensors b/model_00034-of-00072.safetensors
deleted file mode 100644
index 6a463bb72575d671c7bfc7982cf7de9acf6bed41..0000000000000000000000000000000000000000
--- a/model_00034-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6b0f29133ff93cd564ed5b0c3b90de547c0b09ab45a09eb86d7189b73df848ac
-size 4932875573
diff --git a/model_00035-of-00072.safetensors b/model_00035-of-00072.safetensors
deleted file mode 100644
index b58c7974bf9336a2892c989bb85770ed4212d831..0000000000000000000000000000000000000000
--- a/model_00035-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e85b9970d999df06ac3589084fadcf8c54abc4c52a5042e63c77c5ca2fd83b13
-size 4932875573
diff --git a/model_00036-of-00072.safetensors b/model_00036-of-00072.safetensors
deleted file mode 100644
index d8c21c8ec120e3b50ce9ca0f4652a2cac529bb35..0000000000000000000000000000000000000000
--- a/model_00036-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:29292e40478d4c3518f2a88e1b66c41bcf52b8e069ab3eadb1396bc68e4056ff
-size 4932875563
diff --git a/model_00037-of-00072.safetensors b/model_00037-of-00072.safetensors
deleted file mode 100644
index 5e03feaf9555740ea599b4b5980998d48e1cdf0a..0000000000000000000000000000000000000000
--- a/model_00037-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:23a611b2b8c6f50cb1e946f1145d25f38301a26a5e5afd8012eb3f3a4e37f96d
-size 4932875573
diff --git a/model_00038-of-00072.safetensors b/model_00038-of-00072.safetensors
deleted file mode 100644
index c461a911347f6fc1d778a216ad67c8da843a991b..0000000000000000000000000000000000000000
--- a/model_00038-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:21d97c75a684ee95a80d5f6a28bd83dc228b893b7c38b3e644b5ecd94853f6e6
-size 4932875573
diff --git a/model_00039-of-00072.safetensors b/model_00039-of-00072.safetensors
deleted file mode 100644
index 2cccadd18ba2b187437d87131835dd5feddc0d09..0000000000000000000000000000000000000000
--- a/model_00039-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5fa116a4705a58ec525ce6cd6472c2960e83e86c4242232d3e361496161c165d
-size 4932875555
diff --git a/model_00040-of-00072.safetensors b/model_00040-of-00072.safetensors
deleted file mode 100644
index c3fd6d896e6dfdddcf0690eda226de1816934ccb..0000000000000000000000000000000000000000
--- a/model_00040-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:652c454c76f90fc1d0cfc4c8efc023538495f1635fae0892b368c41d64b05daf
-size 4932875533
diff --git a/model_00041-of-00072.safetensors b/model_00041-of-00072.safetensors
deleted file mode 100644
index d74e27a648b14ad733fefb767670185955017020..0000000000000000000000000000000000000000
--- a/model_00041-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:75e3775f02853b3cf39c2d1a29d14762b53ad4264327d13077673c467822de22
-size 4932875573
diff --git a/model_00042-of-00072.safetensors b/model_00042-of-00072.safetensors
deleted file mode 100644
index f0b3179495083a224166b397f46ac2aaaadd23f4..0000000000000000000000000000000000000000
--- a/model_00042-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fb297c192bec234a20ef1b64d8db4a5b19e6ead67c26923016b2011a3c68ef8d
-size 4932875521
diff --git a/model_00043-of-00072.safetensors b/model_00043-of-00072.safetensors
deleted file mode 100644
index 5e2a227f84c1cd76e930410a3b8398718d7b2873..0000000000000000000000000000000000000000
--- a/model_00043-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d947feb8a1521370231d89eab5502110c27678be228768759f22ea543a04e494
-size 4932875573
diff --git a/model_00044-of-00072.safetensors b/model_00044-of-00072.safetensors
deleted file mode 100644
index 6b377b6820dded289f5afc38cef58956d1ac2825..0000000000000000000000000000000000000000
--- a/model_00044-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ba1c110a9f7a5395502e2812d5334fb4f8b618ac2c19277c3b340a75ff25e8e3
-size 4932875555
diff --git a/model_00045-of-00072.safetensors b/model_00045-of-00072.safetensors
deleted file mode 100644
index b221ea7609adf600c550eb6b94d08965f2e01547..0000000000000000000000000000000000000000
--- a/model_00045-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:da3fbda96267fecb61b573f764dec3f47ae6304c007cafae20d1f938f17d0e6f
-size 4932875569
diff --git a/model_00046-of-00072.safetensors b/model_00046-of-00072.safetensors
deleted file mode 100644
index f59b3ec5f9a17bd391d507f1d9cc063b1c5f10e9..0000000000000000000000000000000000000000
--- a/model_00046-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:db5eb422b2ee78b6491179e118828075d0153f800b0a59314f102b2c7c097bff
-size 4932875563
diff --git a/model_00047-of-00072.safetensors b/model_00047-of-00072.safetensors
deleted file mode 100644
index e387c95b3d0f82166beb2bb49eb2dd1ef7a62c4c..0000000000000000000000000000000000000000
--- a/model_00047-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8c1d231f84c18ca2f88b61dff8ec3d00f4d58fc0d780a52bd66f2dd158d5c148
-size 4932875573
diff --git a/model_00048-of-00072.safetensors b/model_00048-of-00072.safetensors
deleted file mode 100644
index e30e05f7a4280981f245bf6fcd2fffa7575ff588..0000000000000000000000000000000000000000
--- a/model_00048-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:085dc3aaad679e4c80dfec734c8a5a5ef42e785938a4f7bfe0f92580ca184ee6
-size 4932875555
diff --git a/model_00049-of-00072.safetensors b/model_00049-of-00072.safetensors
deleted file mode 100644
index 28523aae8467103277ac40843210521c6beabdc1..0000000000000000000000000000000000000000
--- a/model_00049-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:72976131df3b562c0d90bfbd34d7714b8023423eb591fcb5eaa235504535daf4
-size 4932875541
diff --git a/model_00050-of-00072.safetensors b/model_00050-of-00072.safetensors
deleted file mode 100644
index a1d26094941b53cf7b93380eb7ebd63c45d0b7b3..0000000000000000000000000000000000000000
--- a/model_00050-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:695fb74e67a57ba80ebbf05799f28135012293b1192f96a6965bf070d299e9f8
-size 4932875551
diff --git a/model_00051-of-00072.safetensors b/model_00051-of-00072.safetensors
deleted file mode 100644
index 9df7f8f46ae2aeab98a18e096c87a30990ad7f6c..0000000000000000000000000000000000000000
--- a/model_00051-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a2b0d15e4276b736066b476d7ae99d2705a2da6358a2cae0bed7b157d5d1e8d9
-size 4932875573
diff --git a/model_00052-of-00072.safetensors b/model_00052-of-00072.safetensors
deleted file mode 100644
index b2a76e6a0f4d7508ddbc1f4eda7d5975c991456b..0000000000000000000000000000000000000000
--- a/model_00052-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2f3d6fca86403bef9bc5ff684f012d7766fb151f1ae7c6eba8b62fb1f2adc49b
-size 4932875549
diff --git a/model_00053-of-00072.safetensors b/model_00053-of-00072.safetensors
deleted file mode 100644
index 86a399a39ca174e61f8f86ade0ced53f5c415b99..0000000000000000000000000000000000000000
--- a/model_00053-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7cb278a233844db394e78708af7bf664f7039d480e8c4350222dddd0a230806b
-size 4932875527
diff --git a/model_00054-of-00072.safetensors b/model_00054-of-00072.safetensors
deleted file mode 100644
index e01e4a34cd268ed98ea2b71773fd0b705bd4c7a9..0000000000000000000000000000000000000000
--- a/model_00054-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e9d7dba17a7e77f204d64aff6d41975429ebcdb8272c8d1bd7e0b5b018d430d2
-size 4932875573
diff --git a/model_00055-of-00072.safetensors b/model_00055-of-00072.safetensors
deleted file mode 100644
index 3ce131b27a8e173b5f59d5b6e2f776ca17fa9371..0000000000000000000000000000000000000000
--- a/model_00055-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f08e2483bc21d6cd80436b3057fee037e4c913d8742b3f2a852a8e4a5fcec0d9
-size 4932875563
diff --git a/model_00056-of-00072.safetensors b/model_00056-of-00072.safetensors
deleted file mode 100644
index d210cb98bdc5a98bfdd51ea56b6cbb4e0b99dab4..0000000000000000000000000000000000000000
--- a/model_00056-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9fa9e44ff040c33ea4e637589223f0da948287994981e3dc1d636d3e6330fb31
-size 4932875565
diff --git a/model_00057-of-00072.safetensors b/model_00057-of-00072.safetensors
deleted file mode 100644
index b47f2ccf8bafd338366d7b75fd100b7332718e10..0000000000000000000000000000000000000000
--- a/model_00057-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6e4a0722c2be186b3c7e2b46d9fc4212b28ea3c261a4ed4f937ff74764a462de
-size 4932875573
diff --git a/model_00058-of-00072.safetensors b/model_00058-of-00072.safetensors
deleted file mode 100644
index d7db16faeea7fd3cbcc79e89e5a8562d69232cdb..0000000000000000000000000000000000000000
--- a/model_00058-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1324752aa457baabbe0199768e10f5aa126543794a36b072eae0ade4e0312598
-size 4932875573
diff --git a/model_00059-of-00072.safetensors b/model_00059-of-00072.safetensors
deleted file mode 100644
index 2a5e18920458b80a3331b967aeccd045985a0667..0000000000000000000000000000000000000000
--- a/model_00059-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1210cb5b3f55e9eaa1123c8a5da740a3f2f7441bd7c439b66edb0762b4f4a328
-size 4932875563
diff --git a/model_00060-of-00072.safetensors b/model_00060-of-00072.safetensors
deleted file mode 100644
index 65259ef2a40c487b210f3ad6d74fc7ea5412b00a..0000000000000000000000000000000000000000
--- a/model_00060-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f9621c5e1481f20ecae15709309db04883b848f3378713731a3a0584dc7d634d
-size 4932875527
diff --git a/model_00061-of-00072.safetensors b/model_00061-of-00072.safetensors
deleted file mode 100644
index e9c60c11b4b5b91d427784a89430b0a9992d1872..0000000000000000000000000000000000000000
--- a/model_00061-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f0c18975077e977ff38a4432786ed68d2b6878c71131105829c8a73e9b53e837
-size 4932875565
diff --git a/model_00062-of-00072.safetensors b/model_00062-of-00072.safetensors
deleted file mode 100644
index 783d1931f5ff99f8b0cb2e0494796c1386ffd6b4..0000000000000000000000000000000000000000
--- a/model_00062-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8ca1c626087d1ebd1feb40241289776838ab0f3a227b12554439c745de6b26bd
-size 4932875573
diff --git a/model_00063-of-00072.safetensors b/model_00063-of-00072.safetensors
deleted file mode 100644
index 5b435d2891c55ed83631bdb4b3e09cbd04512b1f..0000000000000000000000000000000000000000
--- a/model_00063-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b845dd611ed3c5cbe66522cf7945610ecef5a59f71a1af5d5482966c6140cefe
-size 4932875565
diff --git a/model_00064-of-00072.safetensors b/model_00064-of-00072.safetensors
deleted file mode 100644
index e128331d24a6e8c40edf3e5ca2107a580e709d64..0000000000000000000000000000000000000000
--- a/model_00064-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d4d330ffa91861259bd04c489cce52978e892a922620485f983694f5e40ebca0
-size 4932875537
diff --git a/model_00065-of-00072.safetensors b/model_00065-of-00072.safetensors
deleted file mode 100644
index 1de170ce16534abb15400b659088d1d0def6d70c..0000000000000000000000000000000000000000
--- a/model_00065-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:06941b3f42871c22592098196d067f95ec994c40bee1db36a2df25661ba111fe
-size 4932875571
diff --git a/model_00066-of-00072.safetensors b/model_00066-of-00072.safetensors
deleted file mode 100644
index 5017f51c14c7ed398ade7a5721cded9e4cff99bf..0000000000000000000000000000000000000000
--- a/model_00066-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6d232ef33521b522e82ef32c028a7b6e44abda264c6f96f1f12e9bb19e566f9f
-size 4932875565
diff --git a/model_00067-of-00072.safetensors b/model_00067-of-00072.safetensors
deleted file mode 100644
index 93d2fb276c92ac61954526b220f6a1ee64b16061..0000000000000000000000000000000000000000
--- a/model_00067-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:72e3b2f771658326d3cd996cb33f067732c6abd6ee1286e1589fff5f1490476a
-size 4932875539
diff --git a/model_00068-of-00072.safetensors b/model_00068-of-00072.safetensors
deleted file mode 100644
index 08ca7a07fc84a051d902a0c0bc1eae82c28a5df5..0000000000000000000000000000000000000000
--- a/model_00068-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b8f53b92e859c9a3e43ece6b880c6ff856bfce1330fca2128d0962d337f82d02
-size 4932875563
diff --git a/model_00069-of-00072.safetensors b/model_00069-of-00072.safetensors
deleted file mode 100644
index 6921dc614e2b4fd58f875b1807d1b47f28cf9892..0000000000000000000000000000000000000000
--- a/model_00069-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8337e26123044033391a46e5b16482e2d1a346be6c285ff5667994e118309169
-size 4932875573
diff --git a/model_00070-of-00072.safetensors b/model_00070-of-00072.safetensors
deleted file mode 100644
index 43c3b0cae25be03f140fc544fccbf701a848dcd8..0000000000000000000000000000000000000000
--- a/model_00070-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e01486eb1cb0d2f1fba83201184096bcb82286955d6844dc328087f291e18b2e
-size 4932875553
diff --git a/model_00071-of-00072.safetensors b/model_00071-of-00072.safetensors
deleted file mode 100644
index 6139e6c3742e606bb749af23e3a920d9357e170a..0000000000000000000000000000000000000000
--- a/model_00071-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:664c40c92e29491cd5fdd96c7f52a55fc0ba90c163f5be651584465de4c68a85
-size 4932875557
diff --git a/model_00072-of-00072.safetensors b/model_00072-of-00072.safetensors
deleted file mode 100644
index 17ce2927a95aa695573e1791410ff4cbc6cfd925..0000000000000000000000000000000000000000
--- a/model_00072-of-00072.safetensors
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:50ed9f636610d9d87c142f54067554ad15a68c20b57f0ad035cf9abebb56a0a6
-size 57530