Duplicate from jbochi/madlad400-3b-mt

Browse files

Co-authored-by: J Bochi <jbochi@users.noreply.huggingface.co>

Files changed (13) hide show

.gitattributes +39 -0
README.md +633 -0
added_tokens.json +2 -0
config.json +29 -0
generation_config.json +7 -0
model-q2k.gguf +3 -0
model-q3k.gguf +3 -0
model-q4k.gguf +3 -0
model.safetensors +3 -0
special_tokens_map.json +23 -0
spiece.model +3 -0
tokenizer.json +3 -0
tokenizer_config.json +38 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,39 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
+model-q4k.gguf filter=lfs diff=lfs merge=lfs -text
+model-q2k.gguf filter=lfs diff=lfs merge=lfs -text
+model-q3k.gguf filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,633 @@

+---
+license: apache-2.0
+language:
+- multilingual
+- en
+- ru
+- es
+- fr
+- de
+- it
+- pt
+- pl
+- nl
+- vi
+- tr
+- sv
+- id
+- ro
+- cs
+- zh
+- hu
+- ja
+- th
+- fi
+- fa
+- uk
+- da
+- el
+- "no"
+- bg
+- sk
+- ko
+- ar
+- lt
+- ca
+- sl
+- he
+- et
+- lv
+- hi
+- sq
+- ms
+- az
+- sr
+- ta
+- hr
+- kk
+- is
+- ml
+- mr
+- te
+- af
+- gl
+- fil
+- be
+- mk
+- eu
+- bn
+- ka
+- mn
+- bs
+- uz
+- ur
+- sw
+- yue
+- ne
+- kn
+- kaa
+- gu
+- si
+- cy
+- eo
+- la
+- hy
+- ky
+- tg
+- ga
+- mt
+- my
+- km
+- tt
+- so
+- ku
+- ps
+- pa
+- rw
+- lo
+- ha
+- dv
+- fy
+- lb
+- ckb
+- mg
+- gd
+- am
+- ug
+- ht
+- grc
+- hmn
+- sd
+- jv
+- mi
+- tk
+- ceb
+- yi
+- ba
+- fo
+- or
+- xh
+- su
+- kl
+- ny
+- sm
+- sn
+- co
+- zu
+- ig
+- yo
+- pap
+- st
+- haw
+- as
+- oc
+- cv
+- lus
+- tet
+- gsw
+- sah
+- br
+- rm
+- sa
+- bo
+- om
+- se
+- ce
+- cnh
+- ilo
+- hil
+- udm
+- os
+- lg
+- ti
+- vec
+- ts
+- tyv
+- kbd
+- ee
+- iba
+- av
+- kha
+- to
+- tn
+- nso
+- fj
+- zza
+- ak
+- ada
+- otq
+- dz
+- bua
+- cfm
+- ln
+- chm
+- gn
+- krc
+- wa
+- hif
+- yua
+- srn
+- war
+- rom
+- bik
+- pam
+- sg
+- lu
+- ady
+- kbp
+- syr
+- ltg
+- myv
+- iso
+- kac
+- bho
+- ay
+- kum
+- qu
+- za
+- pag
+- ngu
+- ve
+- pck
+- zap
+- tyz
+- hui
+- bbc
+- tzo
+- tiv
+- ksd
+- gom
+- min
+- ang
+- nhe
+- bgp
+- nzi
+- nnb
+- nv
+- zxx
+- bci
+- kv
+- new
+- mps
+- alt
+- meu
+- bew
+- fon
+- iu
+- abt
+- mgh
+- mnw
+- tvl
+- dov
+- tlh
+- ho
+- kw
+- mrj
+- meo
+- crh
+- mbt
+- emp
+- ace
+- ium
+- mam
+- gym
+- mai
+- crs
+- pon
+- ubu
+- fip
+- quc
+- gv
+- kj
+- btx
+- ape
+- chk
+- rcf
+- shn
+- tzh
+- mdf
+- ppk
+- ss
+- gag
+- cab
+- kri
+- seh
+- ibb
+- tbz
+- bru
+- enq
+- ach
+- cuk
+- kmb
+- wo
+- kek
+- qub
+- tab
+- bts
+- kos
+- rwo
+- cak
+- tuc
+- bum
+- cjk
+- gil
+- stq
+- tsg
+- quh
+- mak
+- arn
+- ban
+- jiv
+- sja
+- yap
+- tcy
+- toj
+- twu
+- xal
+- amu
+- rmc
+- hus
+- nia
+- kjh
+- bm
+- guh
+- mas
+- acf
+- dtp
+- ksw
+- bzj
+- din
+- zne
+- mad
+- msi
+- mag
+- mkn
+- kg
+- lhu
+- ch
+- qvi
+- mh
+- djk
+- sus
+- mfe
+- srm
+- dyu
+- ctu
+- gui
+- pau
+- inb
+- bi
+- mni
+- guc
+- jam
+- wal
+- jac
+- bas
+- gor
+- skr
+- nyu
+- noa
+- sda
+- gub
+- nog
+- cni
+- teo
+- tdx
+- sxn
+- rki
+- nr
+- frp
+- alz
+- taj
+- lrc
+- cce
+- rn
+- jvn
+- hvn
+- nij
+- dwr
+- izz
+- msm
+- bus
+- ktu
+- chr
+- maz
+- tzj
+- suz
+- knj
+- bim
+- gvl
+- bqc
+- tca
+- pis
+- prk
+- laj
+- mel
+- qxr
+- niq
+- ahk
+- shp
+- hne
+- spp
+- koi
+- krj
+- quf
+- luz
+- agr
+- tsc
+- mqy
+- gof
+- gbm
+- miq
+- dje
+- awa
+- bjj
+- qvz
+- sjp
+- tll
+- raj
+- kjg
+- bgz
+- quy
+- cbk
+- akb
+- oj
+- ify
+- mey
+- ks
+- cac
+- brx
+- qup
+- syl
+- jax
+- ff
+- ber
+- tks
+- trp
+- mrw
+- adh
+- smt
+- srr
+- ffm
+- qvc
+- mtr
+- ann
+- kaa
+- aa
+- noe
+- nut
+- gyn
+- kwi
+- xmm
+- msb
+library_name: transformers
+tags:
+- text2text-generation
+- text-generation-inference
+datasets:
+- allenai/MADLAD-400
+pipeline_tag: translation
+widget:
+- text: "<2en> Como vai, amigo?"
+  example_title: "Translation to English"
+- text: "<2de> Do you speak German?"
+  example_title: "Translation to German"
+---
+# Model Card for MADLAD-400-3B-MT
+# Table of Contents
+0. [TL;DR](#tldr)
+1. [Model Details](#model-details)
+2. [Usage](#usage)
+3. [Uses](#uses)
+4. [Bias, Risks, and Limitations](#bias-risks-and-limitations)
+5. [Training Details](#training-details)
+6. [Evaluation](#evaluation)
+7. [Environmental Impact](#environmental-impact)
+8. [Citation](#citation)
+# TL;DR
+MADLAD-400-3B-MT is a multilingual machine translation model based on the T5 architecture that was
+trained on 1 trillion tokens covering over 450 languages using publicly available data.
+It is competitive with models that are significantly larger.
+**Disclaimer**: [Juarez Bochi](https://huggingface.co/jbochi), who was not involved in this research, converted
+the original weights and wrote the contents of this model card based on the original paper and Flan-T5.
+# Model Details
+## Model Description
+- **Model type:** Language model
+- **Language(s) (NLP):** Multilingual (400+ languages)
+- **License:** Apache 2.0
+- **Related Models:** [All MADLAD-400 Checkpoints](https://huggingface.co/models?search=madlad)
+- **Original Checkpoints:** [All Original MADLAD-400 Checkpoints](https://github.com/google-research/google-research/tree/master/madlad_400)
+- **Resources for more information:**
+  - [Research paper](https://arxiv.org/abs/2309.04662)
+  - [GitHub Repo](https://github.com/google-research/t5x)
+  - [Hugging Face MADLAD-400 Docs (Similar to T5)](https://huggingface.co/docs/transformers/model_doc/MADLAD-400) - [Pending PR](https://github.com/huggingface/transformers/pull/27471)
+# Usage
+Find below some example scripts on how to use the model:
+## Using the PyTorch model with `transformers`
+### Running the model on a CPU or GPU
+<details>
+<summary> Click to expand </summary>
+First, install the Python packages that are required:
+`pip install transformers accelerate sentencepiece`
+```python
+from transformers import T5ForConditionalGeneration, T5Tokenizer
+model_name = 'jbochi/madlad400-3b-mt'
+model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto")
+tokenizer = T5Tokenizer.from_pretrained(model_name)
+text = "<2pt> I love pizza!"
+input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)
+outputs = model.generate(input_ids=input_ids)
+tokenizer.decode(outputs[0], skip_special_tokens=True)
+# Eu adoro pizza!
+```
+</details>
+## Running the model with Candle
+<details>
+<summary> Click to expand </summary>
+Usage with [candle](https://github.com/huggingface/candle):
+```bash
+$ cargo run --example t5 --release  -- \
+  --model-id "jbochi/madlad400-3b-mt" \
+  --prompt "<2de> How are you, my friend?" \
+  --decode --temperature 0
+```
+We also provide a quantized model (1.65 GB vs the original 11.8 GB file):
+```
+cargo run --example quantized-t5 --release  -- \
+  --model-id "jbochi/madlad400-3b-mt" --weight-file "model-q4k.gguf" \
+  --prompt "<2de> How are you, my friend?" \
+  --temperature 0
+...
+ Wie geht es dir, mein Freund?
+```
+</details>
+# Uses
+## Direct Use and Downstream Use
+> Primary intended uses: Machine Translation and multilingual NLP tasks on over 400 languages.
+> Primary intended users: Research community.
+## Out-of-Scope Use
+> These models are trained on general domain data and are therefore not meant to
+> work on domain-specific models out-of-the box. Moreover, these research models have not been assessed
+> for production usecases.
+# Bias, Risks, and Limitations
+> We note that we evaluate on only 204 of the languages supported by these models and on machine translation
+> and few-shot machine translation tasks. Users must consider use of this model carefully for their own
+> usecase.
+## Ethical considerations and risks
+> We trained these models with MADLAD-400 and publicly available data to create baseline models that
+> support NLP for over 400 languages, with a focus on languages underrepresented in large-scale corpora.
+> Given that these models were trained with web-crawled datasets that may contain sensitive, offensive or
+> otherwise low-quality content despite extensive preprocessing, it is still possible that these issues to the
+> underlying training data may cause differences in model performance and toxic (or otherwise problematic)
+> output for certain domains. Moreover, large models are dual use technologies that have specific risks
+> associated with their use and development. We point the reader to surveys such as those written by
+> Weidinger et al. or Bommasani et al. for a more detailed discussion of these risks, and to Liebling
+> et al. for a thorough discussion of the risks of machine translation systems.
+## Known Limitations
+More information needed
+## Sensitive Use
+More information needed
+# Training Details
+> We train models of various sizes: a 3B, 32-layer parameter model,
+> a 7.2B 48-layer parameter model and a 10.7B 32-layer parameter model.
+> We share all parameters of the model across language pairs,
+> and use a Sentence Piece Model with 256k tokens shared on both the encoder and decoder
+> side. Each input sentence has a <2xx> token prepended to the source sentence to indicate the target
+> language.
+See the [research paper](https://arxiv.org/pdf/2309.04662.pdf) for further details.
+## Training Data
+> For both the machine translation and language model, MADLAD-400 is used. For the machine translation
+> model, a combination of parallel datasources covering 157 languages is also used. Further details are
+> described in the [paper](https://arxiv.org/pdf/2309.04662.pdf).
+## Training Procedure
+See the [research paper](https://arxiv.org/pdf/2309.04662.pdf) for further details.
+# Evaluation
+## Testing Data, Factors & Metrics
+> For evaluation, we used WMT, NTREX, Flores-200 and Gatones datasets as described in Section 4.3 in the [paper](https://arxiv.org/pdf/2309.04662.pdf).
+> The translation quality of this model varies based on language, as seen in the paper, and likely varies on
+> domain, though we have not assessed this.
+## Results
+![image/png](https://cdn-uploads.huggingface.co/production/uploads/64b7f632037d6452a321fa15/EzsMD1AwCuFH0S0DeD-n8.png)
+![image/png](https://cdn-uploads.huggingface.co/production/uploads/64b7f632037d6452a321fa15/CJ5zCUVy7vTU76Lc8NZcK.png)
+![image/png](https://cdn-uploads.huggingface.co/production/uploads/64b7f632037d6452a321fa15/NK0S-yVeWuhKoidpLYh3m.png)
+See the [research paper](https://arxiv.org/pdf/2309.04662.pdf) for further details.
+# Environmental Impact
+More information needed
+# Citation
+**BibTeX:**
+```bibtex
+@misc{kudugunta2023madlad400,
+      title={MADLAD-400: A Multilingual And Document-Level Large Audited Dataset},
+      author={Sneha Kudugunta and Isaac Caswell and Biao Zhang and Xavier Garcia and Christopher A. Choquette-Choo and Katherine Lee and Derrick Xin and Aditya Kusupati and Romi Stella and Ankur Bapna and Orhan Firat},
+      year={2023},
+      eprint={2309.04662},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+```

added_tokens.json ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ {
2	+ }

config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "architectures": [
+        "T5ForConditionalGeneration"
+    ],
+    "d_ff": 8192,
+    "d_kv": 128,
+    "d_model": 1024,
+    "dropout_rate": 0.1,
+    "decoder_start_token_id": 0,
+    "pad_token_id": 1,
+    "eos_token_id": 2,
+    "feed_forward_proj": "gated-gelu",
+    "initializer_factor": 1.0,
+    "is_encoder_decoder": true,
+    "layer_norm_epsilon": 1e-06,
+    "model_type": "t5",
+    "n_positions": 512,
+    "num_decoder_layers": 32,
+    "num_heads": 16,
+    "num_layers": 32,
+    "output_past": true,
+    "relative_attention_max_distance": 128,
+    "relative_attention_num_buckets": 32,
+    "task_specific_params": {},
+    "tie_word_embeddings": false,
+    "transformers_version": "4.23.1",
+    "use_cache": true,
+    "vocab_size": 256000
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "decoder_start_token_id": 0,
+  "eos_token_id": 2,
+  "pad_token_id": 1,
+  "transformers_version": "4.35.0"
+}

model-q2k.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:abab3b4cf39c076b94517e5b7f53bf43a70e72032d787dde948c56a1c597df5d
+size 965486240

model-q3k.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6d9850c77c4610037085b2595f5cfde13a23a70296596209d8d61b7cc0d34e6e
+size 1264101024

model-q4k.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ea6e5531a3e95213c7f0635988d119e078a655c09306e47851e15d4c0c3f9c37
+size 1654597280

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:66ff5f8fcaf92291da486fdfbd4d5233cec90e1359348a56e3172c978b3a76d4
+size 11761587872

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

spiece.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ef11ac9a22c7503492f56d48dce53be20e339b63605983e9f27d2cd0e0f3922c
+size 4427844

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a2799ccc696b752ba00c34f58726bfe253a04921ceb6cfc620400f560474790b
+size 16629031

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "</s>",
+  "extra_ids": 0,
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<s>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "T5Tokenizer",
+  "unk_token": "<unk>"
+}