hf-seamless-m4t-large / tokenizer_config.json
ylacombe's picture
Upload processor (#7)
94be387
raw
history blame
21.2 kB
{
"added_tokens_decoder": {
"0": {
"content": "<pad>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"1": {
"content": "<unk>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"2": {
"content": "<s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"3": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"256001": {
"content": "__afr__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256002": {
"content": "__amh__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256003": {
"content": "__arb__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256004": {
"content": "__ary__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256005": {
"content": "__arz__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256006": {
"content": "__asm__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256007": {
"content": "__azj__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256008": {
"content": "__bel__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256009": {
"content": "__ben__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256010": {
"content": "__bos__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256011": {
"content": "__bul__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256012": {
"content": "__cat__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256013": {
"content": "__ceb__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256014": {
"content": "__ces__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256015": {
"content": "__ckb__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256016": {
"content": "__cmn__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256017": {
"content": "__cmn_Hant__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256018": {
"content": "__cym__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256019": {
"content": "__dan__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256020": {
"content": "__deu__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256021": {
"content": "__ell__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256022": {
"content": "__eng__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256023": {
"content": "__est__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256024": {
"content": "__eus__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256025": {
"content": "__fin__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256026": {
"content": "__fra__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256027": {
"content": "__fuv__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256028": {
"content": "__gaz__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256029": {
"content": "__gle__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256030": {
"content": "__glg__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256031": {
"content": "__guj__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256032": {
"content": "__heb__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256033": {
"content": "__hin__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256034": {
"content": "__hrv__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256035": {
"content": "__hun__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256036": {
"content": "__hye__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256037": {
"content": "__ibo__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256038": {
"content": "__ind__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256039": {
"content": "__isl__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256040": {
"content": "__ita__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256041": {
"content": "__jav__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256042": {
"content": "__jpn__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256043": {
"content": "__kan__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256044": {
"content": "__kat__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256045": {
"content": "__kaz__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256046": {
"content": "__khk__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256047": {
"content": "__khm__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256048": {
"content": "__kir__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256049": {
"content": "__kor__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256050": {
"content": "__lao__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256051": {
"content": "__lit__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256052": {
"content": "__lug__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256053": {
"content": "__luo__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256054": {
"content": "__lvs__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256055": {
"content": "__mai__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256056": {
"content": "__mal__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256057": {
"content": "__mar__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256058": {
"content": "__mkd__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256059": {
"content": "__mlt__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256060": {
"content": "__mni__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256061": {
"content": "__mya__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256062": {
"content": "__nld__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256063": {
"content": "__nno__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256064": {
"content": "__nob__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256065": {
"content": "__npi__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256066": {
"content": "__nya__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256067": {
"content": "__ory__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256068": {
"content": "__pan__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256069": {
"content": "__pbt__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256070": {
"content": "__pes__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256071": {
"content": "__pol__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256072": {
"content": "__por__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256073": {
"content": "__ron__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256074": {
"content": "__rus__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256075": {
"content": "__sat__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256076": {
"content": "__slk__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256077": {
"content": "__slv__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256078": {
"content": "__sna__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256079": {
"content": "__snd__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256080": {
"content": "__som__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256081": {
"content": "__spa__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256082": {
"content": "__srp__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256083": {
"content": "__swe__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256084": {
"content": "__swh__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256085": {
"content": "__tam__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256086": {
"content": "__tel__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256087": {
"content": "__tgk__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256088": {
"content": "__tgl__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256089": {
"content": "__tha__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256090": {
"content": "__tur__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256091": {
"content": "__ukr__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256092": {
"content": "__urd__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256093": {
"content": "__uzn__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256094": {
"content": "__vie__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256095": {
"content": "__yor__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256096": {
"content": "__yue__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256097": {
"content": "__zlm__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256098": {
"content": "__zul__",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256099": {
"content": "<MINED_DATA>",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256100": {
"content": "<MMT_BT_DATA>",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
},
"256101": {
"content": "<SMT_BT_DATA>",
"lstrip": true,
"normalized": false,
"rstrip": true,
"single_word": false,
"special": true
}
},
"additional_special_tokens": [
"__afr__",
"__amh__",
"__arb__",
"__ary__",
"__arz__",
"__asm__",
"__azj__",
"__bel__",
"__ben__",
"__bos__",
"__bul__",
"__cat__",
"__ceb__",
"__ces__",
"__ckb__",
"__cmn__",
"__cmn_Hant__",
"__cym__",
"__dan__",
"__deu__",
"__ell__",
"__eng__",
"__est__",
"__eus__",
"__fin__",
"__fra__",
"__fuv__",
"__gaz__",
"__gle__",
"__glg__",
"__guj__",
"__heb__",
"__hin__",
"__hrv__",
"__hun__",
"__hye__",
"__ibo__",
"__ind__",
"__isl__",
"__ita__",
"__jav__",
"__jpn__",
"__kan__",
"__kat__",
"__kaz__",
"__khk__",
"__khm__",
"__kir__",
"__kor__",
"__lao__",
"__lit__",
"__lug__",
"__luo__",
"__lvs__",
"__mai__",
"__mal__",
"__mar__",
"__mkd__",
"__mlt__",
"__mni__",
"__mya__",
"__nld__",
"__nno__",
"__nob__",
"__npi__",
"__nya__",
"__ory__",
"__pan__",
"__pbt__",
"__pes__",
"__pol__",
"__por__",
"__ron__",
"__rus__",
"__sat__",
"__slk__",
"__slv__",
"__sna__",
"__snd__",
"__som__",
"__spa__",
"__srp__",
"__swe__",
"__swh__",
"__tam__",
"__tel__",
"__tgk__",
"__tgl__",
"__tha__",
"__tur__",
"__ukr__",
"__urd__",
"__uzn__",
"__vie__",
"__yor__",
"__yue__",
"__zlm__",
"__zul__",
"<MINED_DATA>",
"<MMT_BT_DATA>",
"<SMT_BT_DATA>"
],
"bos_token": "<s>",
"clean_up_tokenization_spaces": true,
"cls_token": "<s>",
"eos_token": "</s>",
"language_code": [
"afr",
"amh",
"arb",
"ary",
"arz",
"asm",
"azj",
"bel",
"ben",
"bos",
"bul",
"cat",
"ceb",
"ces",
"ckb",
"cmn",
"cmn_Hant",
"cym",
"dan",
"deu",
"ell",
"eng",
"est",
"eus",
"fin",
"fra",
"fuv",
"gaz",
"gle",
"glg",
"guj",
"heb",
"hin",
"hrv",
"hun",
"hye",
"ibo",
"ind",
"isl",
"ita",
"jav",
"jpn",
"kan",
"kat",
"kaz",
"khk",
"khm",
"kir",
"kor",
"lao",
"lit",
"lug",
"luo",
"lvs",
"mai",
"mal",
"mar",
"mkd",
"mlt",
"mni",
"mya",
"nld",
"nno",
"nob",
"npi",
"nya",
"ory",
"pan",
"pbt",
"pes",
"pol",
"por",
"ron",
"rus",
"sat",
"slk",
"slv",
"sna",
"snd",
"som",
"spa",
"srp",
"swe",
"swh",
"tam",
"tel",
"tgk",
"tgl",
"tha",
"tur",
"ukr",
"urd",
"uzn",
"vie",
"yor",
"yue",
"zlm",
"zul"
],
"model_max_length": 1000000000000000019884624838656,
"pad_token": "<pad>",
"processor_class": "SeamlessM4TProcessor",
"sep_token": "</s>",
"sp_model_kwargs": {},
"src_lang": "__eng__",
"tgt_lang": "__fra__",
"tokenizer_class": "SeamlessM4TTokenizer",
"tokenizer_file": null,
"unk_token": "<unk>"
}