git-trg / tokenizer.json
michelleyunun's picture
Upload tokenizer
4b4f58c
raw
history blame
13.8 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<start>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<end>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<pad>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": false,
"use_regex": true
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"<start>": 0,
"<end>": 1,
"<pad>": 2,
"-": 3,
".": 4,
"1": 5,
"2": 6,
"3": 7,
"<": 8,
">": 9,
"A": 10,
"B": 11,
"C": 12,
"D": 13,
"E": 14,
"F": 15,
"G": 16,
"I": 17,
"J": 18,
"L": 19,
"M": 20,
"N": 21,
"O": 22,
"P": 23,
"R": 24,
"S": 25,
"T": 26,
"U": 27,
"V": 28,
"W": 29,
"X": 30,
"Z": 31,
"a": 32,
"b": 33,
"c": 34,
"d": 35,
"e": 36,
"f": 37,
"g": 38,
"h": 39,
"i": 40,
"k": 41,
"l": 42,
"m": 43,
"n": 44,
"o": 45,
"p": 46,
"r": 47,
"s": 48,
"t": 49,
"u": 50,
"v": 51,
"w": 52,
"y": 53,
"Ġ": 54,
"ar": 55,
"nd": 56,
"st": 57,
"art": 58,
"end": 59,
"Ġ<": 60,
"start": 61,
"CN": 62,
"II": 63,
"ĠC": 64,
"CNJ": 65,
"ĠCCNJ": 66,
"SG": 67,
"ĠL": 68,
"OC": 69,
"ou": 70,
"ĠI": 71,
"on": 72,
"PL": 73,
"ĠP": 74,
"ĠLOC": 75,
"ho": 76,
"Ġg": 77,
"Ġs": 78,
"in": 79,
"AS": 80,
"Ġgo": 81,
"ĠN": 82,
"ĠD": 83,
"Ġw": 84,
"ĠPR": 85,
"Ġt": 86,
"ĠIN": 87,
"Ġ1": 88,
"Ġp": 89,
"SP": 90,
"MP": 91,
"OMP": 92,
"ĠCOMP": 93,
"Ġh": 94,
"TR": 95,
"EP": 96,
"le": 97,
"ck": 98,
"Ġl": 99,
"OSP": 100,
"ĠPROSP": 101,
"ke": 102,
"ll": 103,
"se": 104,
"ID": 105,
"re": 106,
"Ġar": 107,
"VB": 108,
"ĠLVB": 109,
"ay": 110,
"Ġin": 111,
"out": 112,
"CCNJ": 113,
"or": 114,
"CEP": 115,
"ĠINCEP": 116,
"EG": 117,
"FOC": 118,
"te": 119,
"ĠNEG": 120,
"ake": 121,
"ound": 122,
"Ġaround": 123,
"WID": 124,
"Ġf": 125,
"ĠDWID": 126,
"Ġon": 127,
"PAS": 128,
"me": 129,
"PASS": 130,
"an": 131,
"ee": 132,
"pe": 133,
"EAS": 134,
"REAS": 135,
"Ġb": 136,
"ĠREAS": 137,
"ouse": 138,
"Ġpi": 139,
"ĠS": 140,
"co": 141,
"LZ": 142,
"MLZ": 143,
"ĠNMLZ": 144,
"ork": 145,
"it": 146,
"Ġm": 147,
"BL": 148,
"FV": 149,
"OBL": 150,
"PFV": 151,
"ĠIPFV": 152,
"AN": 153,
"mp": 154,
"to": 155,
"Ġwho": 156,
"Ġhouse": 157,
"Ġpipe": 158,
"comp": 159,
"Ġhit": 160,
"ac": 161,
"ain": 162,
"all": 163,
"Ġtr": 164,
"ĠINS": 165,
"ear": 166,
"ow": 167,
"oll": 168,
"Ġtake": 169,
"Ġfoll": 170,
"Ġfollow": 171,
"RR": 172,
"Ġout": 173,
"AU": 174,
"ĠOBL": 175,
"AUS": 176,
"ri": 177,
"ong": 178,
"SX": 179,
"ack": 180,
"lm": 181,
"ve": 182,
"Ġo": 183,
"holm": 184,
"Ġsee": 185,
"ckholm": 186,
"ĠSto": 187,
"ĠStockholm": 188,
"CAUS": 189,
"IP": 190,
"TIP": 191,
"ly": 192,
"od": 193,
"par": 194,
"Ġcomp": 195,
"Ġgood": 196,
"Ġwork": 197,
"Ġth": 198,
"lete": 199,
"ANTIP": 200,
"Ġcomplete": 201,
"Ġcompletely": 202,
"ER": 203,
"VER": 204,
"ĠVER": 205,
"Ġsay": 206,
"DM": 207,
"IS": 208,
"he": 209,
"ir": 210,
"rn": 211,
"rt": 212,
"tu": 213,
"Ġ3": 214,
"III": 215,
"hort": 216,
"Ġshort": 217,
"Ġli": 218,
"Ġmake": 219,
"Ġtry": 220,
"turn": 221,
"ab": 222,
"un": 223,
"Ġlay": 224,
"able": 225,
"AR": 226,
"ca": 227,
"do": 228,
"way": 229,
"Ġdo": 230,
"ĠIRR": 231,
"ĠPAR": 232,
"SPT": 233,
"Ġdoor": 234,
"ĠPART": 235,
"DE": 236,
"OX": 237,
"PR": 238,
"work": 239,
"Ġre": 240,
"home": 241,
"Ġwh": 242,
"DEM": 243,
"PROX": 244,
"ate": 245,
"no": 246,
"so": 247,
"Ġno": 248,
"rive": 249,
"Ġnot": 250,
"Ġlong": 251,
"Ġlack": 252,
"PN": 253,
"AX": 254,
"EL": 255,
"EV": 256,
"IRR": 257,
"MAN": 258,
"ad": 259,
"ag": 260,
"at": 261,
"con": 262,
"de": 263,
"ike": 264,
"ian": 265,
"ide": 266,
"long": 267,
"like": 268,
"mall": 269,
"os": 270,
"ole": 271,
"pre": 272,
"pouse": 273,
"ros": 274,
"side": 275,
"tin": 276,
"uall": 277,
"year": 278,
"ĠCN": 279,
"Ġho": 280,
"Ġac": 281,
"ĠAX": 282,
"Ġcon": 283,
"Ġyear": 284,
"ĠCan": 285,
"ĠPCNJ": 286,
"Ġgir": 287,
"Ġsmall": 288,
"Ġspouse": 289,
"ĠDM": 290,
"ĠDIS": 291,
"Ġwee": 292,
"Ġpee": 293,
"Ġpole": 294,
"ree": 295,
"rest": 296,
"Ġinside": 297,
"any": 298,
"Ġpick": 299,
"ĠSEL": 300,
"Ġman": 301,
"company": 302,
"acros": 303,
"Ġtrain": 304,
"pare": 305,
"Ġthree": 306,
"heart": 307,
"Ġlie": 308,
"case": 309,
"Ġreturn": 310,
"Ġwhat": 311,
"EVID": 312,
"MANR": 313,
"adian": 314,
"again": 315,
"prepare": 316,
"tinuall": 317,
"ĠCNTR": 318,
"Ġhole": 319,
"Ġaccompany": 320,
"Ġcontinuall": 321,
"ĠCanadian": 322,
"Ġgirl": 323,
"ĠDISTR": 324,
"Ġweek": 325,
"ĠSELF": 326,
"across": 327,
"Ġcontinually": 328,
"ES": 329,
"ak": 330,
"eri": 331,
"epar": 332,
"gh": 333,
"ig": 334,
"ind": 335,
"Ġun": 336,
"ough": 337,
"Ġsepar": 338,
"ĠDES": 339,
"Ġperi": 340,
"Ġhear": 341,
"reak": 342,
"Ġarrive": 343,
"ter": 344,
"Ġfind": 345,
"Ġone": 346,
"meter": 347,
"Ġback": 348,
"Ġbig": 349,
"Ġbreak": 350,
"Ġoh": 351,
"Ġthough": 352,
"Ġunable": 353,
"Ġseparate": 354,
"Ġperimeter": 355,
"Ġthought": 356,
"fir": 357,
"ime": 358,
"lo": 359,
"pain": 360,
"run": 361,
"the": 362,
"time": 363,
"Ġco": 364,
"Ġall": 365,
"ĠSPT": 366,
"Ġrun": 367,
"ĠPREP": 368,
"EPIS": 369,
"Ġfear": 370,
"pen": 371,
"Ġblo": 372,
"ĠSpain": 373,
"Ġopen": 374,
"Ġlive": 375,
"first": 376,
"Ġcome": 377,
"Ġblock": 378,
"eca": 379,
"use": 380,
"Ġbeca": 381,
"Ġbecause": 382,
"AL": 383,
"BM": 384,
"VAL": 385,
"ai": 386,
"as": 387,
"ame": 388,
"ce": 389,
"en": 390,
"ep": 391,
"ff": 392,
"ite": 393,
"ice": 394,
"lac": 395,
"mar": 396,
"name": 397,
"prive": 398,
"rs": 399
},
"merges": [
"a r",
"n d",
"s t",
"ar t",
"e nd",
"Ġ <",
"st art",
"C N",
"I I",
"Ġ C",
"CN J",
"ĠC CNJ",
"S G",
"Ġ L",
"O C",
"o u",
"Ġ I",
"o n",
"P L",
"Ġ P",
"ĠL OC",
"h o",
"Ġ g",
"Ġ s",
"i n",
"A S",
"Ġg o",
"Ġ N",
"Ġ D",
"Ġ w",
"ĠP R",
"Ġ t",
"ĠI N",
"Ġ 1",
"Ġ p",
"S P",
"M P",
"O MP",
"ĠC OMP",
"Ġ h",
"T R",
"E P",
"l e",
"c k",
"Ġ l",
"O SP",
"ĠPR OSP",
"k e",
"l l",
"s e",
"I D",
"r e",
"Ġ ar",
"V B",
"ĠL VB",
"a y",
"Ġ in",
"ou t",
"C CNJ",
"o r",
"C EP",
"ĠIN CEP",
"E G",
"F OC",
"t e",
"ĠN EG",
"a ke",
"ou nd",
"Ġar ound",
"W ID",
"Ġ f",
"ĠD WID",
"Ġ on",
"P AS",
"m e",
"PAS S",
"a n",
"e e",
"p e",
"E AS",
"R EAS",
"Ġ b",
"Ġ REAS",
"ou se",
"Ġp i",
"Ġ S",
"c o",
"L Z",
"M LZ",
"ĠN MLZ",
"or k",
"i t",
"Ġ m",
"B L",
"F V",
"O BL",
"P FV",
"ĠI PFV",
"A N",
"m p",
"t o",
"Ġw ho",
"Ġh ouse",
"Ġpi pe",
"co mp",
"Ġh it",
"a c",
"a in",
"a ll",
"Ġt r",
"ĠIN S",
"e ar",
"o w",
"o ll",
"Ġt ake",
"Ġf oll",
"Ġfoll ow",
"R R",
"Ġ out",
"A U",
"Ġ OBL",
"AU S",
"r i",
"on g",
"S X",
"a ck",
"l m",
"v e",
"Ġ o",
"ho lm",
"Ġs ee",
"ck holm",
"ĠS to",
"ĠSto ckholm",
"C AUS",
"I P",
"T IP",
"l y",
"o d",
"p ar",
"Ġ comp",
"Ġgo od",
"Ġw ork",
"Ġt h",
"le te",
"AN TIP",
"Ġcomp lete",
"Ġcomplete ly",
"E R",
"V ER",
"Ġ VER",
"Ġs ay",
"D M",
"I S",
"h e",
"i r",
"r n",
"r t",
"t u",
"Ġ 3",
"II I",
"ho rt",
"Ġs hort",
"Ġl i",
"Ġm ake",
"Ġtr y",
"tu rn",
"a b",
"u n",
"Ġl ay",
"ab le",
"A R",
"c a",
"d o",
"w ay",
"Ġ do",
"ĠI RR",
"ĠP AR",
"SP T",
"Ġdo or",
"ĠPAR T",
"D E",
"O X",
"P R",
"w ork",
"Ġ re",
"ho me",
"Ġw h",
"DE M",
"PR OX",
"a te",
"n o",
"s o",
"Ġ no",
"ri ve",
"Ġno t",
"Ġl ong",
"Ġl ack",
"P N",
"A X",
"E L",
"E V",
"I RR",
"M AN",
"a d",
"a g",
"a t",
"c on",
"d e",
"i ke",
"i an",
"i de",
"l ong",
"l ike",
"m all",
"o s",
"o le",
"p re",
"p ouse",
"r os",
"s ide",
"t in",
"u all",
"y ear",
"Ġ CN",
"Ġ ho",
"Ġ ac",
"Ġ AX",
"Ġ con",
"Ġ year",
"ĠC an",
"ĠP CNJ",
"Ġg ir",
"Ġs mall",
"Ġs pouse",
"ĠD M",
"ĠD IS",
"Ġw ee",
"Ġp ee",
"Ġp ole",
"re e",
"re st",
"Ġin side",
"an y",
"Ġpi ck",
"ĠS EL",
"Ġm an",
"comp any",
"ac ros",
"Ġtr ain",
"par e",
"Ġth ree",
"he art",
"Ġli e",
"ca se",
"Ġre turn",
"Ġwh at",
"EV ID",
"MAN R",
"ad ian",
"ag ain",
"pre pare",
"tin uall",
"ĠCN TR",
"Ġho le",
"Ġac company",
"Ġcon tinuall",
"ĠCan adian",
"Ġgir l",
"ĠDIS TR",
"Ġwee k",
"ĠSEL F",
"acros s",
"Ġcontinuall y",
"E S",
"a k",
"e ri",
"e par",
"g h",
"i g",
"i nd",
"Ġ un",
"ou gh",
"Ġs epar",
"ĠD ES",
"Ġp eri",
"Ġh ear",
"re ak",
"Ġar rive",
"te r",
"Ġf ind",
"Ġon e",
"me ter",
"Ġb ack",
"Ġb ig",
"Ġb reak",
"Ġo h",
"Ġth ough",
"Ġun able",
"Ġsepar ate",
"Ġperi meter",
"Ġthough t",
"f ir",
"i me",
"l o",
"p ain",
"r un",
"t he",
"t ime",
"Ġ co",
"Ġ all",
"Ġ SPT",
"Ġ run",
"ĠPR EP",
"EP IS",
"Ġf ear",
"pe n",
"Ġb lo",
"ĠS pain",
"Ġo pen",
"Ġli ve",
"fir st",
"Ġco me",
"Ġblo ck",
"e ca",
"u se",
"Ġb eca",
"Ġbeca use",
"A L",
"B M",
"V AL",
"a i",
"a s",
"a me",
"c e",
"e n",
"e p",
"f f",
"i te",
"i ce",
"l ac",
"m ar",
"n ame",
"p rive",
"r s"
]
}
}