git-trg / tokenizer.json
michelleyunun's picture
Upload tokenizer
bb3915b
raw
history blame
3.64 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<start>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<end>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": false,
"use_regex": true
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"<start>": 0,
"<end>": 1,
"<pad>": 2,
"\"": 3,
"'": 4,
",": 5,
"-": 6,
".": 7,
"<": 8,
">": 9,
"A": 10,
"B": 11,
"C": 12,
"D": 13,
"G": 14,
"H": 15,
"I": 16,
"M": 17,
"N": 18,
"O": 19,
"S": 20,
"a": 21,
"b": 22,
"c": 23,
"d": 24,
"e": 25,
"f": 26,
"g": 27,
"h": 28,
"i": 29,
"j": 30,
"k": 31,
"l": 32,
"m": 33,
"n": 34,
"o": 35,
"p": 36,
"r": 37,
"s": 38,
"t": 39,
"u": 40,
"w": 41,
"x": 42,
"y": 43,
"²": 44,
"Ì": 45,
"Ġ": 46,
"st": 47,
"nd": 48,
"ar": 49,
"end": 50,
"Ġ<": 51,
"star": 52,
"start": 53,
"hl": 54,
"̲": 55,
"wi": 56,
"ii": 57,
"Ġg": 58,
"aa": 59,
"oo": 60,
"Ġn": 61,
"Ġwi": 62,
"Ġ'": 63,
"Ġii": 64,
"an": 65,
"Ġy": 66,
"Ġl": 67,
"Ii": 68,
"ĠIi": 69,
"oohl": 70,
"ee": 71,
"im": 72,
"Ġwil": 73,
"Ġh": 74,
"whl": 75,
"Ġhl": 76,
"ag": 77,
"dii": 78,
"nii": 79,
"ts": 80,
"xwi": 81,
"Ġd": 82,
"Ġha": 83,
"uu": 84,
"Ġnee": 85,
"xs": 86,
"Ġyu": 87,
"Ġa": 88,
"ip": 89,
"kwhl": 90,
"wihl": 91,
"gi": 92,
"Ġk": 93,
"xw": 94,
"'m": 95,
"Ġxs": 96,
"Ġdim": 97,
"Ġneedii": 98,
"igi": 99
},
"merges": [
"s t",
"n d",
"a r",
"e nd",
"Ġ <",
"st ar",
"star t",
"h l",
"Ì ²",
"w i",
"i i",
"Ġ g",
"a a",
"o o",
"Ġ n",
"Ġ wi",
"Ġ '",
"Ġ ii",
"a n",
"Ġ y",
"Ġ l",
"I i",
"Ġ Ii",
"oo hl",
"e e",
"i m",
"Ġwi l",
"Ġ h",
"w hl",
"Ġ hl",
"a g",
"d ii",
"n ii",
"t s",
"x wi",
"Ġ d",
"Ġh a",
"u u",
"Ġn ee",
"x s",
"Ġy u",
"Ġ a",
"i p",
"k whl",
"wi hl",
"g i",
"Ġ k",
"x w",
"' m",
"Ġ xs",
"Ġd im",
"Ġnee dii",
"i gi"
]
}
}