test_vocab_101 / tokenizer.json
ronanki's picture
Upload tokenizer
5058dba
raw
history blame
13.9 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "NFD"
},
{
"type": "Lowercase"
},
{
"type": "StripAccents"
}
]
},
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 1
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
1
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
2
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": null,
"model": {
"type": "WordPiece",
"unk_token": "[UNK]",
"continuing_subword_prefix": "##",
"max_input_chars_per_word": 100,
"vocab": {
"[PAD]": 0,
"[CLS]": 1,
"[SEP]": 2,
"[UNK]": 3,
"[MASK]": 4,
"\"": 5,
"%": 6,
"&": 7,
"'": 8,
"(": 9,
")": 10,
"*": 11,
"+": 12,
",": 13,
"-": 14,
".": 15,
"/": 16,
"0": 17,
"1": 18,
"2": 19,
"3": 20,
"4": 21,
"5": 22,
"6": 23,
"7": 24,
"8": 25,
"9": 26,
":": 27,
";": 28,
"<": 29,
"=": 30,
">": 31,
"[": 32,
"]": 33,
"_": 34,
"a": 35,
"b": 36,
"c": 37,
"d": 38,
"e": 39,
"f": 40,
"g": 41,
"h": 42,
"i": 43,
"j": 44,
"k": 45,
"l": 46,
"m": 47,
"n": 48,
"o": 49,
"p": 50,
"q": 51,
"r": 52,
"s": 53,
"t": 54,
"u": 55,
"v": 56,
"w": 57,
"x": 58,
"y": 59,
"z": 60,
"~": 61,
"®": 62,
"°": 63,
"²": 64,
"³": 65,
"µ": 66,
"ø": 67,
"α": 68,
"–": 69,
"##u": 70,
"##l": 71,
"##i": 72,
"##p": 73,
"##w": 74,
"##o": 75,
"##d": 76,
"##e": 77,
"##s": 78,
"##g": 79,
"##n": 80,
"##t": 81,
"##c": 82,
"##m": 83,
"##h": 84,
"##r": 85,
"##v": 86,
"##0": 87,
"##a": 88,
"##y": 89,
"##7": 90,
"##2": 91,
"##1": 92,
"##4": 93,
"##b": 94,
"##x": 95,
"##6": 96,
"##3": 97,
"##5": 98,
"##f": 99,
"##j": 100,
"##k": 101,
"##;": 102,
"##z": 103,
"##µ": 104,
"##8": 105,
"##9": 106,
"##%": 107,
"##(": 108,
"##q": 109,
"##]": 110,
"##)": 111,
"##,": 112,
"##_": 113,
"##\"": 114,
"##>": 115,
"##/": 116,
"##ø": 117,
"##<": 118,
"##=": 119,
"##-": 120,
"##er": 121,
"##in": 122,
"##on": 123,
"##at": 124,
"##ed": 125,
"##ro": 126,
"##en": 127,
"##an": 128,
"##ic": 129,
"##as": 130,
"##ar": 131,
"##or": 132,
"##al": 133,
"##ct": 134,
"##it": 135,
"##ol": 136,
"##il": 137,
"##ion": 138,
"##ing": 139,
"##es": 140,
"##et": 141,
"##el": 142,
"##id": 143,
"in": 144,
"##le": 145,
"tr": 146,
"##um": 147,
"##yl": 148,
"en": 149,
"##ent": 150,
"##te": 151,
"##re": 152,
"##ation": 153,
"co": 154,
"##st": 155,
"##od": 156,
"pl": 157,
"##am": 158,
"##rom": 159,
"con": 160,
"##ity": 161,
"##80": 162,
"##ric": 163,
"##ul": 164,
"di": 165,
"from": 166,
"##ene": 167,
"##ig": 168,
"##580": 169,
"##5804": 170,
"pro": 171,
"ele": 172,
"##ut": 173,
"##ck": 174,
"re": 175,
"elect": 176,
"##lo": 177,
"##ur": 178,
"##ate": 179,
"a1": 180,
"a3": 181,
"##ad": 182,
"al": 183,
"##ix": 184,
"##ated": 185,
"pa": 186,
"##igh": 187,
"##ine": 188,
"##du": 189,
"##oly": 190,
"mix": 191,
"##ac": 192,
"car": 193,
"ste": 194,
"at": 195,
"##ood": 196,
"st": 197,
"electric": 198,
"##ow": 199,
"##im": 200,
"##sp": 201,
"##em": 202,
"##ium": 203,
"20": 204,
"gr": 205,
"poly": 206,
"##ard": 207,
"##eth": 208,
"##ap": 209,
"##ill": 210,
"mm": 211,
"electricity": 212,
"##uck": 213,
"truck": 214,
"##ip": 215,
"##ber": 216,
"##ch": 217,
"##duct": 218,
"##15804": 219,
"en15804": 220,
"cont": 221,
"##pp": 222,
"##ort": 223,
"##ide": 224,
"##ight": 225,
"plan": 226,
"sh": 227,
"of": 228,
"##nd": 229,
"##ast": 230,
"product": 231,
"##oad": 232,
"##eat": 233,
"##ge": 234,
"##wood": 235,
"##ased": 236,
"pow": 237,
"steel": 238,
"##ver": 239,
"##ess": 240,
"ac": 241,
"##ater": 242,
"##00": 243,
"##and": 244,
"content": 245,
"par": 246,
"##ri": 247,
"com": 248,
"ro": 249,
"was": 250,
"##yload": 251,
"payload": 252,
"##th": 253,
"ch": 254,
"##ist": 255,
"##ter": 256,
"##of": 257,
"fi": 258,
"gas": 259,
"##ass": 260,
"waste": 261,
"cap": 262,
"##ylene": 263,
"gro": 264,
"##ith": 265,
"##ane": 266,
"power": 267,
"tran": 268,
"##cl": 269,
"15804": 270,
"app": 271,
"based": 272,
"##is": 273,
"capac": 274,
"##ure": 275,
"th": 276,
"and": 277,
"water": 278,
"##om": 279,
"plant": 280,
"for": 281,
"un": 282,
"##led": 283,
"grid": 284,
"##us": 285,
"##eet": 286,
"part": 287,
"##yd": 288,
"##2o": 289,
"##est": 290,
"##old": 291,
"by": 292,
"##co": 293,
"oil": 294,
"##eight": 295,
"##ce": 296,
"bo": 297,
"##rox": 298,
"res": 299,
"##gy": 300,
"##ss": 301,
"##aw": 302,
"sheet": 303,
"h2o": 304,
"us": 305,
"##ly": 306,
"with": 307,
"##esel": 308,
"transp": 309,
"capacity": 310,
"mo": 311,
"##iner": 312,
"transport": 313,
"##uro": 314,
"weight": 315,
"##iz": 316,
"),": 317,
"##bon": 318,
"bi": 319,
"on": 320,
"##ergy": 321,
"diesel": 322,
"energy": 323,
"euro": 324,
"##un": 325,
"##lu": 326,
"##rect": 327,
"##age": 328,
"approx": 329,
"se": 330,
"gross": 331,
"carbon": 332,
"cr": 333,
"##imation": 334,
"approximation": 335,
"##cy": 336,
"##ment": 337,
"##fill": 338,
"to": 339,
"12": 340,
"acid": 341,
"land": 342,
"##umin": 343,
"op": 344,
"##ug": 345,
"##av": 346,
"landfill": 347,
"##ial": 348,
"alumin": 349,
"##mm": 350,
"##put": 351,
"##ural": 352,
"allo": 353,
"##cess": 354,
"fiber": 355,
"eth": 356,
"powered": 357,
"process": 358,
"production": 359,
"ex": 360,
"vd": 361,
"##act": 362,
"moist": 363,
"alloc": 364,
"##iv": 365,
"##cr": 366,
"##ain": 367,
"##lor": 368,
"##ab": 369,
"de": 370,
"##mg": 371,
"hyd": 372,
"kw": 373,
"pr": 374,
"##ile": 375,
"##one": 376,
"moisture": 377,
"hard": 378,
"##amine": 379,
"rol": 380,
"open": 381,
"##os": 382,
"##all": 383,
"te": 384,
"##ne": 385,
"##iler": 386,
"##con": 387,
"##for": 388,
"kg": 389,
"##erm": 390,
"dri": 391,
"input": 392,
"comp": 393,
"cf": 394,
"so": 395,
"cem": 396,
"##ph": 397,
"treat": 398,
"treatment": 399,
"ag": 400,
"lc": 401,
"##der": 402,
"allocation": 403,
"rolled": 404,
"flo": 405,
"mun": 406,
"##icip": 407,
"##ctor": 408,
"plast": 409,
"roof": 410,
"municip": 411,
"municipal": 412,
"10": 413,
"cold": 414,
"##pe": 415,
"##ay": 416,
"##verage": 417,
"average": 418,
"##cycl": 419,
"recycl": 420,
"dried": 421,
"meth": 422,
"wood": 423,
"ep": 424,
"nat": 425,
"##br": 426,
"##og": 427,
"60": 428,
"##ance": 429,
"201": 430,
"natural": 431,
"steam": 432,
"aluminium": 433,
"##ite": 434,
"therm": 435,
"high": 436,
"wh": 437,
"##di": 438,
"##sion": 439,
"reg": 440,
"##ot": 441,
"14": 442,
"##uel": 443,
"##bo": 444,
"pan": 445,
"ply": 446,
"fuel": 447,
"hydro": 448,
"ha": 449,
"mod": 450,
"sol": 451,
"##ub": 452,
"##ust": 453,
"##lass": 454,
"##ir": 455,
"conne": 456,
"connector": 457,
"c3": 458,
"lum": 459,
"inc": 460,
"chlor": 461,
"region": 462,
"eg": 463,
"sof": 464,
"##sul": 465,
"##ry": 466,
"coated": 467,
"floor": 468,
"soft": 469,
"cor": 470,
"light": 471,
"comb": 472,
"board": 473,
"lumber": 474,
"c4": 475,
"##kv": 476,
"kil": 477,
"mel": 478,
"kiln": 479,
"bf": 480,
"coat": 481,
"a5": 482,
"recycling": 483,
"##x1": 484,
"##ical": 485,
"inciner": 486,
"##mer": 487,
"haul": 488,
"##ens": 489,
"##ode": 490,
"cut": 491,
"fl": 492,
"he": 493,
"pet": 494,
"up": 495,
"##anul": 496,
"direct": 497,
"granul": 498,
"incineration": 499,
"heat": 500,
"melamine": 501,
"but": 502,
"softwood": 503,
"rene": 504,
"insul": 505,
"200": 506,
"vda": 507,
"cement": 508,
"renew": 509,
"e1": 510,
"##ethane": 511,
"15": 512,
"pin": 513,
"##uf": 514,
"##dust": 515,
"25": 516,
"hd": 517,
"sm": 518,
"sy": 519,
"slu": 520,
"##ia": 521,
"##op": 522,
"##dge": 523,
"##yp": 524,
"##int": 525,
"##ingle": 526,
"dist": 527,
"roofing": 528,
"granulate": 529,
"sludge": 530,
"mill": 531,
"sp": 532,
"##ding": 533,
"electr": 534,
"##board": 535,
"##ue": 536,
"##if": 537,
"##ong": 538,
"flooring": 539,
"22": 540,
"fo": 541,
"pe": 542,
"##infor": 543,
"reinfor": 544,
"reinforc": 545,
"##ied": 546,
"##ox": 547,
"##tr": 548,
"mem": 549,
"##ive": 550,
"nm": 551,
"single": 552,
"##per": 553,
"##fin": 554,
"##icle": 555,
"incl": 556,
"##sph": 557,
"tex": 558,
"plywood": 559,
"model": 560,
"16": 561,
"18": 562,
"man": 563,
"ph": 564,
"saw": 565,
"##oline": 566,
"indi": 567,
"gasoline": 568,
"indirect": 569,
"dw": 570,
"##yr": 571,
"##ete": 572,
"##ell": 573,
"resist": 574,
"hardwood": 575,
"sil": 576,
"##anol": 577,
"ba": 578,
"coating": 579,
"000": 580,
"pip": 581,
"##ool": 582,
"boiler": 583,
"sul": 584,
"indust": 585,
"membr": 586,
"##ous": 587,
"##ides": 588,
"##crete": 589,
"membrane": 590,
"resin": 591,
"insulation": 592,
"32": 593,
"econ": 594,
"ic": 595,
"econom": 596,
"pv": 597,
"##pylene": 598,
"##hyd": 599
}
}
}