michelleyunun committed on
Commit
e1c43e1
1 Parent(s): ae48b9d

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +4 -0
  2. tokenizer.json +861 -0
  3. tokenizer_config.json +8 -0
special_tokens_map.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "eos_token": "<end>",
3
+ "pad_token": "<pad>"
4
+ }
tokenizer.json ADDED
@@ -0,0 +1,861 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<start>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<end>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<pad>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ }
33
+ ],
34
+ "normalizer": null,
35
+ "pre_tokenizer": {
36
+ "type": "ByteLevel",
37
+ "add_prefix_space": false,
38
+ "trim_offsets": true,
39
+ "use_regex": true
40
+ },
41
+ "post_processor": {
42
+ "type": "ByteLevel",
43
+ "add_prefix_space": true,
44
+ "trim_offsets": false,
45
+ "use_regex": true
46
+ },
47
+ "decoder": {
48
+ "type": "ByteLevel",
49
+ "add_prefix_space": true,
50
+ "trim_offsets": true,
51
+ "use_regex": true
52
+ },
53
+ "model": {
54
+ "type": "BPE",
55
+ "dropout": null,
56
+ "unk_token": null,
57
+ "continuing_subword_prefix": null,
58
+ "end_of_word_suffix": null,
59
+ "fuse_unk": false,
60
+ "byte_fallback": false,
61
+ "vocab": {
62
+ "<start>": 0,
63
+ "<end>": 1,
64
+ "<pad>": 2,
65
+ "-": 3,
66
+ ".": 4,
67
+ "1": 5,
68
+ "2": 6,
69
+ "3": 7,
70
+ "<": 8,
71
+ ">": 9,
72
+ "A": 10,
73
+ "B": 11,
74
+ "C": 12,
75
+ "D": 13,
76
+ "E": 14,
77
+ "F": 15,
78
+ "G": 16,
79
+ "I": 17,
80
+ "J": 18,
81
+ "L": 19,
82
+ "M": 20,
83
+ "N": 21,
84
+ "O": 22,
85
+ "P": 23,
86
+ "R": 24,
87
+ "S": 25,
88
+ "T": 26,
89
+ "U": 27,
90
+ "V": 28,
91
+ "W": 29,
92
+ "X": 30,
93
+ "Z": 31,
94
+ "a": 32,
95
+ "b": 33,
96
+ "c": 34,
97
+ "d": 35,
98
+ "e": 36,
99
+ "f": 37,
100
+ "g": 38,
101
+ "h": 39,
102
+ "i": 40,
103
+ "k": 41,
104
+ "l": 42,
105
+ "m": 43,
106
+ "n": 44,
107
+ "o": 45,
108
+ "p": 46,
109
+ "r": 47,
110
+ "s": 48,
111
+ "t": 49,
112
+ "u": 50,
113
+ "v": 51,
114
+ "w": 52,
115
+ "y": 53,
116
+ "Ġ": 54,
117
+ "ar": 55,
118
+ "nd": 56,
119
+ "st": 57,
120
+ "art": 58,
121
+ "end": 59,
122
+ "Ġ<": 60,
123
+ "start": 61,
124
+ "CN": 62,
125
+ "II": 63,
126
+ "ĠC": 64,
127
+ "CNJ": 65,
128
+ "ĠCCNJ": 66,
129
+ "SG": 67,
130
+ "ĠL": 68,
131
+ "OC": 69,
132
+ "ou": 70,
133
+ "ĠI": 71,
134
+ "on": 72,
135
+ "PL": 73,
136
+ "ĠP": 74,
137
+ "ĠLOC": 75,
138
+ "ho": 76,
139
+ "Ġg": 77,
140
+ "Ġs": 78,
141
+ "in": 79,
142
+ "AS": 80,
143
+ "Ġgo": 81,
144
+ "ĠN": 82,
145
+ "ĠD": 83,
146
+ "Ġw": 84,
147
+ "ĠPR": 85,
148
+ "Ġt": 86,
149
+ "ĠIN": 87,
150
+ "Ġ1": 88,
151
+ "Ġp": 89,
152
+ "SP": 90,
153
+ "MP": 91,
154
+ "OMP": 92,
155
+ "ĠCOMP": 93,
156
+ "Ġh": 94,
157
+ "TR": 95,
158
+ "EP": 96,
159
+ "le": 97,
160
+ "ck": 98,
161
+ "Ġl": 99,
162
+ "OSP": 100,
163
+ "ĠPROSP": 101,
164
+ "ke": 102,
165
+ "ll": 103,
166
+ "se": 104,
167
+ "ID": 105,
168
+ "re": 106,
169
+ "Ġar": 107,
170
+ "VB": 108,
171
+ "ĠLVB": 109,
172
+ "ay": 110,
173
+ "Ġin": 111,
174
+ "out": 112,
175
+ "CCNJ": 113,
176
+ "or": 114,
177
+ "CEP": 115,
178
+ "ĠINCEP": 116,
179
+ "EG": 117,
180
+ "FOC": 118,
181
+ "te": 119,
182
+ "ĠNEG": 120,
183
+ "ake": 121,
184
+ "ound": 122,
185
+ "Ġaround": 123,
186
+ "WID": 124,
187
+ "Ġf": 125,
188
+ "ĠDWID": 126,
189
+ "Ġon": 127,
190
+ "PAS": 128,
191
+ "me": 129,
192
+ "PASS": 130,
193
+ "an": 131,
194
+ "ee": 132,
195
+ "pe": 133,
196
+ "EAS": 134,
197
+ "REAS": 135,
198
+ "Ġb": 136,
199
+ "ĠREAS": 137,
200
+ "ouse": 138,
201
+ "Ġpi": 139,
202
+ "ĠS": 140,
203
+ "co": 141,
204
+ "LZ": 142,
205
+ "MLZ": 143,
206
+ "ĠNMLZ": 144,
207
+ "ork": 145,
208
+ "it": 146,
209
+ "Ġm": 147,
210
+ "BL": 148,
211
+ "FV": 149,
212
+ "OBL": 150,
213
+ "PFV": 151,
214
+ "ĠIPFV": 152,
215
+ "AN": 153,
216
+ "mp": 154,
217
+ "to": 155,
218
+ "Ġwho": 156,
219
+ "Ġhouse": 157,
220
+ "Ġpipe": 158,
221
+ "comp": 159,
222
+ "Ġhit": 160,
223
+ "ac": 161,
224
+ "ain": 162,
225
+ "all": 163,
226
+ "Ġtr": 164,
227
+ "ĠINS": 165,
228
+ "ear": 166,
229
+ "ow": 167,
230
+ "oll": 168,
231
+ "Ġtake": 169,
232
+ "Ġfoll": 170,
233
+ "Ġfollow": 171,
234
+ "RR": 172,
235
+ "Ġout": 173,
236
+ "AU": 174,
237
+ "ĠOBL": 175,
238
+ "AUS": 176,
239
+ "ri": 177,
240
+ "ong": 178,
241
+ "SX": 179,
242
+ "ack": 180,
243
+ "lm": 181,
244
+ "ve": 182,
245
+ "Ġo": 183,
246
+ "holm": 184,
247
+ "Ġsee": 185,
248
+ "ckholm": 186,
249
+ "ĠSto": 187,
250
+ "ĠStockholm": 188,
251
+ "CAUS": 189,
252
+ "IP": 190,
253
+ "TIP": 191,
254
+ "ly": 192,
255
+ "od": 193,
256
+ "par": 194,
257
+ "Ġcomp": 195,
258
+ "Ġgood": 196,
259
+ "Ġwork": 197,
260
+ "Ġth": 198,
261
+ "lete": 199,
262
+ "ANTIP": 200,
263
+ "Ġcomplete": 201,
264
+ "Ġcompletely": 202,
265
+ "ER": 203,
266
+ "VER": 204,
267
+ "ĠVER": 205,
268
+ "Ġsay": 206,
269
+ "DM": 207,
270
+ "IS": 208,
271
+ "he": 209,
272
+ "ir": 210,
273
+ "rn": 211,
274
+ "rt": 212,
275
+ "tu": 213,
276
+ "Ġ3": 214,
277
+ "III": 215,
278
+ "hort": 216,
279
+ "Ġshort": 217,
280
+ "Ġli": 218,
281
+ "Ġmake": 219,
282
+ "Ġtry": 220,
283
+ "turn": 221,
284
+ "ab": 222,
285
+ "un": 223,
286
+ "Ġlay": 224,
287
+ "able": 225,
288
+ "AR": 226,
289
+ "ca": 227,
290
+ "do": 228,
291
+ "way": 229,
292
+ "Ġdo": 230,
293
+ "ĠIRR": 231,
294
+ "ĠPAR": 232,
295
+ "SPT": 233,
296
+ "Ġdoor": 234,
297
+ "ĠPART": 235,
298
+ "DE": 236,
299
+ "OX": 237,
300
+ "PR": 238,
301
+ "work": 239,
302
+ "Ġre": 240,
303
+ "home": 241,
304
+ "Ġwh": 242,
305
+ "DEM": 243,
306
+ "PROX": 244,
307
+ "ate": 245,
308
+ "no": 246,
309
+ "so": 247,
310
+ "Ġno": 248,
311
+ "rive": 249,
312
+ "Ġnot": 250,
313
+ "Ġlong": 251,
314
+ "Ġlack": 252,
315
+ "PN": 253,
316
+ "AX": 254,
317
+ "EL": 255,
318
+ "EV": 256,
319
+ "IRR": 257,
320
+ "MAN": 258,
321
+ "ad": 259,
322
+ "ag": 260,
323
+ "at": 261,
324
+ "con": 262,
325
+ "de": 263,
326
+ "ike": 264,
327
+ "ian": 265,
328
+ "ide": 266,
329
+ "long": 267,
330
+ "like": 268,
331
+ "mall": 269,
332
+ "os": 270,
333
+ "ole": 271,
334
+ "pre": 272,
335
+ "pouse": 273,
336
+ "ros": 274,
337
+ "side": 275,
338
+ "tin": 276,
339
+ "uall": 277,
340
+ "year": 278,
341
+ "ĠCN": 279,
342
+ "Ġho": 280,
343
+ "Ġac": 281,
344
+ "ĠAX": 282,
345
+ "Ġcon": 283,
346
+ "Ġyear": 284,
347
+ "ĠCan": 285,
348
+ "ĠPCNJ": 286,
349
+ "Ġgir": 287,
350
+ "Ġsmall": 288,
351
+ "Ġspouse": 289,
352
+ "ĠDM": 290,
353
+ "ĠDIS": 291,
354
+ "Ġwee": 292,
355
+ "Ġpee": 293,
356
+ "Ġpole": 294,
357
+ "ree": 295,
358
+ "rest": 296,
359
+ "Ġinside": 297,
360
+ "any": 298,
361
+ "Ġpick": 299,
362
+ "ĠSEL": 300,
363
+ "Ġman": 301,
364
+ "company": 302,
365
+ "acros": 303,
366
+ "Ġtrain": 304,
367
+ "pare": 305,
368
+ "Ġthree": 306,
369
+ "heart": 307,
370
+ "Ġlie": 308,
371
+ "case": 309,
372
+ "Ġreturn": 310,
373
+ "Ġwhat": 311,
374
+ "EVID": 312,
375
+ "MANR": 313,
376
+ "adian": 314,
377
+ "again": 315,
378
+ "prepare": 316,
379
+ "tinuall": 317,
380
+ "ĠCNTR": 318,
381
+ "Ġhole": 319,
382
+ "Ġaccompany": 320,
383
+ "Ġcontinuall": 321,
384
+ "ĠCanadian": 322,
385
+ "Ġgirl": 323,
386
+ "ĠDISTR": 324,
387
+ "Ġweek": 325,
388
+ "ĠSELF": 326,
389
+ "across": 327,
390
+ "Ġcontinually": 328,
391
+ "ES": 329,
392
+ "ak": 330,
393
+ "eri": 331,
394
+ "epar": 332,
395
+ "gh": 333,
396
+ "ig": 334,
397
+ "ind": 335,
398
+ "Ġun": 336,
399
+ "ough": 337,
400
+ "Ġsepar": 338,
401
+ "ĠDES": 339,
402
+ "Ġperi": 340,
403
+ "Ġhear": 341,
404
+ "reak": 342,
405
+ "Ġarrive": 343,
406
+ "ter": 344,
407
+ "Ġfind": 345,
408
+ "Ġone": 346,
409
+ "meter": 347,
410
+ "Ġback": 348,
411
+ "Ġbig": 349,
412
+ "Ġbreak": 350,
413
+ "Ġoh": 351,
414
+ "Ġthough": 352,
415
+ "Ġunable": 353,
416
+ "Ġseparate": 354,
417
+ "Ġperimeter": 355,
418
+ "Ġthought": 356,
419
+ "fir": 357,
420
+ "ime": 358,
421
+ "lo": 359,
422
+ "pain": 360,
423
+ "run": 361,
424
+ "the": 362,
425
+ "time": 363,
426
+ "Ġco": 364,
427
+ "Ġall": 365,
428
+ "ĠSPT": 366,
429
+ "Ġrun": 367,
430
+ "ĠPREP": 368,
431
+ "EPIS": 369,
432
+ "Ġfear": 370,
433
+ "pen": 371,
434
+ "Ġblo": 372,
435
+ "ĠSpain": 373,
436
+ "Ġopen": 374,
437
+ "Ġlive": 375,
438
+ "first": 376,
439
+ "Ġcome": 377,
440
+ "Ġblock": 378,
441
+ "eca": 379,
442
+ "use": 380,
443
+ "Ġbeca": 381,
444
+ "Ġbecause": 382,
445
+ "AL": 383,
446
+ "BM": 384,
447
+ "VAL": 385,
448
+ "ai": 386,
449
+ "as": 387,
450
+ "ame": 388,
451
+ "ce": 389,
452
+ "en": 390,
453
+ "ep": 391,
454
+ "ff": 392,
455
+ "ite": 393,
456
+ "ice": 394,
457
+ "lac": 395,
458
+ "mar": 396,
459
+ "name": 397,
460
+ "prive": 398,
461
+ "rs": 399,
462
+ "sit": 400,
463
+ "uprive": 401,
464
+ "was": 402,
465
+ "ĠCAUS": 403,
466
+ "ĠIBM": 404,
467
+ "Ġsle": 405,
468
+ "ĠDen": 406,
469
+ "Ġwate": 407,
470
+ "Ġplac": 408,
471
+ "pers": 409,
472
+ "Ġmai": 410,
473
+ "ach": 411,
474
+ "Ġoff": 412,
475
+ "Ġreach": 413,
476
+ "Ġwhite": 414,
477
+ "mark": 415,
478
+ "upriver": 416,
479
+ "wash": 417,
480
+ "Ġsleep": 418,
481
+ "ĠDenmark": 419,
482
+ "Ġwater": 420,
483
+ "Ġplace": 421,
484
+ "person": 422,
485
+ "Ġmail": 423,
486
+ "Ġoffice": 424
487
+ },
488
+ "merges": [
489
+ "a r",
490
+ "n d",
491
+ "s t",
492
+ "ar t",
493
+ "e nd",
494
+ "Ġ <",
495
+ "st art",
496
+ "C N",
497
+ "I I",
498
+ "Ġ C",
499
+ "CN J",
500
+ "ĠC CNJ",
501
+ "S G",
502
+ "Ġ L",
503
+ "O C",
504
+ "o u",
505
+ "Ġ I",
506
+ "o n",
507
+ "P L",
508
+ "Ġ P",
509
+ "ĠL OC",
510
+ "h o",
511
+ "Ġ g",
512
+ "Ġ s",
513
+ "i n",
514
+ "A S",
515
+ "Ġg o",
516
+ "Ġ N",
517
+ "Ġ D",
518
+ "Ġ w",
519
+ "ĠP R",
520
+ "Ġ t",
521
+ "ĠI N",
522
+ "Ġ 1",
523
+ "Ġ p",
524
+ "S P",
525
+ "M P",
526
+ "O MP",
527
+ "ĠC OMP",
528
+ "Ġ h",
529
+ "T R",
530
+ "E P",
531
+ "l e",
532
+ "c k",
533
+ "Ġ l",
534
+ "O SP",
535
+ "ĠPR OSP",
536
+ "k e",
537
+ "l l",
538
+ "s e",
539
+ "I D",
540
+ "r e",
541
+ "Ġ ar",
542
+ "V B",
543
+ "ĠL VB",
544
+ "a y",
545
+ "Ġ in",
546
+ "ou t",
547
+ "C CNJ",
548
+ "o r",
549
+ "C EP",
550
+ "ĠIN CEP",
551
+ "E G",
552
+ "F OC",
553
+ "t e",
554
+ "ĠN EG",
555
+ "a ke",
556
+ "ou nd",
557
+ "Ġar ound",
558
+ "W ID",
559
+ "Ġ f",
560
+ "ĠD WID",
561
+ "Ġ on",
562
+ "P AS",
563
+ "m e",
564
+ "PAS S",
565
+ "a n",
566
+ "e e",
567
+ "p e",
568
+ "E AS",
569
+ "R EAS",
570
+ "Ġ b",
571
+ "Ġ REAS",
572
+ "ou se",
573
+ "Ġp i",
574
+ "Ġ S",
575
+ "c o",
576
+ "L Z",
577
+ "M LZ",
578
+ "ĠN MLZ",
579
+ "or k",
580
+ "i t",
581
+ "Ġ m",
582
+ "B L",
583
+ "F V",
584
+ "O BL",
585
+ "P FV",
586
+ "ĠI PFV",
587
+ "A N",
588
+ "m p",
589
+ "t o",
590
+ "Ġw ho",
591
+ "Ġh ouse",
592
+ "Ġpi pe",
593
+ "co mp",
594
+ "Ġh it",
595
+ "a c",
596
+ "a in",
597
+ "a ll",
598
+ "Ġt r",
599
+ "ĠIN S",
600
+ "e ar",
601
+ "o w",
602
+ "o ll",
603
+ "Ġt ake",
604
+ "Ġf oll",
605
+ "Ġfoll ow",
606
+ "R R",
607
+ "Ġ out",
608
+ "A U",
609
+ "Ġ OBL",
610
+ "AU S",
611
+ "r i",
612
+ "on g",
613
+ "S X",
614
+ "a ck",
615
+ "l m",
616
+ "v e",
617
+ "Ġ o",
618
+ "ho lm",
619
+ "Ġs ee",
620
+ "ck holm",
621
+ "ĠS to",
622
+ "ĠSto ckholm",
623
+ "C AUS",
624
+ "I P",
625
+ "T IP",
626
+ "l y",
627
+ "o d",
628
+ "p ar",
629
+ "Ġ comp",
630
+ "Ġgo od",
631
+ "Ġw ork",
632
+ "Ġt h",
633
+ "le te",
634
+ "AN TIP",
635
+ "Ġcomp lete",
636
+ "Ġcomplete ly",
637
+ "E R",
638
+ "V ER",
639
+ "Ġ VER",
640
+ "Ġs ay",
641
+ "D M",
642
+ "I S",
643
+ "h e",
644
+ "i r",
645
+ "r n",
646
+ "r t",
647
+ "t u",
648
+ "Ġ 3",
649
+ "II I",
650
+ "ho rt",
651
+ "Ġs hort",
652
+ "Ġl i",
653
+ "Ġm ake",
654
+ "Ġtr y",
655
+ "tu rn",
656
+ "a b",
657
+ "u n",
658
+ "Ġl ay",
659
+ "ab le",
660
+ "A R",
661
+ "c a",
662
+ "d o",
663
+ "w ay",
664
+ "Ġ do",
665
+ "ĠI RR",
666
+ "ĠP AR",
667
+ "SP T",
668
+ "Ġdo or",
669
+ "ĠPAR T",
670
+ "D E",
671
+ "O X",
672
+ "P R",
673
+ "w ork",
674
+ "Ġ re",
675
+ "ho me",
676
+ "Ġw h",
677
+ "DE M",
678
+ "PR OX",
679
+ "a te",
680
+ "n o",
681
+ "s o",
682
+ "Ġ no",
683
+ "ri ve",
684
+ "Ġno t",
685
+ "Ġl ong",
686
+ "Ġl ack",
687
+ "P N",
688
+ "A X",
689
+ "E L",
690
+ "E V",
691
+ "I RR",
692
+ "M AN",
693
+ "a d",
694
+ "a g",
695
+ "a t",
696
+ "c on",
697
+ "d e",
698
+ "i ke",
699
+ "i an",
700
+ "i de",
701
+ "l ong",
702
+ "l ike",
703
+ "m all",
704
+ "o s",
705
+ "o le",
706
+ "p re",
707
+ "p ouse",
708
+ "r os",
709
+ "s ide",
710
+ "t in",
711
+ "u all",
712
+ "y ear",
713
+ "Ġ CN",
714
+ "Ġ ho",
715
+ "Ġ ac",
716
+ "Ġ AX",
717
+ "Ġ con",
718
+ "Ġ year",
719
+ "ĠC an",
720
+ "ĠP CNJ",
721
+ "Ġg ir",
722
+ "Ġs mall",
723
+ "Ġs pouse",
724
+ "ĠD M",
725
+ "ĠD IS",
726
+ "Ġw ee",
727
+ "Ġp ee",
728
+ "Ġp ole",
729
+ "re e",
730
+ "re st",
731
+ "Ġin side",
732
+ "an y",
733
+ "Ġpi ck",
734
+ "ĠS EL",
735
+ "Ġm an",
736
+ "comp any",
737
+ "ac ros",
738
+ "Ġtr ain",
739
+ "par e",
740
+ "Ġth ree",
741
+ "he art",
742
+ "Ġli e",
743
+ "ca se",
744
+ "Ġre turn",
745
+ "Ġwh at",
746
+ "EV ID",
747
+ "MAN R",
748
+ "ad ian",
749
+ "ag ain",
750
+ "pre pare",
751
+ "tin uall",
752
+ "ĠCN TR",
753
+ "Ġho le",
754
+ "Ġac company",
755
+ "Ġcon tinuall",
756
+ "ĠCan adian",
757
+ "Ġgir l",
758
+ "ĠDIS TR",
759
+ "Ġwee k",
760
+ "ĠSEL F",
761
+ "acros s",
762
+ "Ġcontinuall y",
763
+ "E S",
764
+ "a k",
765
+ "e ri",
766
+ "e par",
767
+ "g h",
768
+ "i g",
769
+ "i nd",
770
+ "Ġ un",
771
+ "ou gh",
772
+ "Ġs epar",
773
+ "ĠD ES",
774
+ "Ġp eri",
775
+ "Ġh ear",
776
+ "re ak",
777
+ "Ġar rive",
778
+ "te r",
779
+ "Ġf ind",
780
+ "Ġon e",
781
+ "me ter",
782
+ "Ġb ack",
783
+ "Ġb ig",
784
+ "Ġb reak",
785
+ "Ġo h",
786
+ "Ġth ough",
787
+ "Ġun able",
788
+ "Ġsepar ate",
789
+ "Ġperi meter",
790
+ "Ġthough t",
791
+ "f ir",
792
+ "i me",
793
+ "l o",
794
+ "p ain",
795
+ "r un",
796
+ "t he",
797
+ "t ime",
798
+ "Ġ co",
799
+ "Ġ all",
800
+ "Ġ SPT",
801
+ "Ġ run",
802
+ "ĠPR EP",
803
+ "EP IS",
804
+ "Ġf ear",
805
+ "pe n",
806
+ "Ġb lo",
807
+ "ĠS pain",
808
+ "Ġo pen",
809
+ "Ġli ve",
810
+ "fir st",
811
+ "Ġco me",
812
+ "Ġblo ck",
813
+ "e ca",
814
+ "u se",
815
+ "Ġb eca",
816
+ "Ġbeca use",
817
+ "A L",
818
+ "B M",
819
+ "V AL",
820
+ "a i",
821
+ "a s",
822
+ "a me",
823
+ "c e",
824
+ "e n",
825
+ "e p",
826
+ "f f",
827
+ "i te",
828
+ "i ce",
829
+ "l ac",
830
+ "m ar",
831
+ "n ame",
832
+ "p rive",
833
+ "r s",
834
+ "s it",
835
+ "u prive",
836
+ "w as",
837
+ "ĠC AUS",
838
+ "ĠI BM",
839
+ "Ġs le",
840
+ "ĠD en",
841
+ "Ġw ate",
842
+ "Ġp lac",
843
+ "pe rs",
844
+ "Ġm ai",
845
+ "ac h",
846
+ "Ġo ff",
847
+ "Ġre ach",
848
+ "Ġwh ite",
849
+ "mar k",
850
+ "uprive r",
851
+ "was h",
852
+ "Ġsle ep",
853
+ "ĠDen mark",
854
+ "Ġwate r",
855
+ "Ġplac e",
856
+ "pers on",
857
+ "Ġmai l",
858
+ "Ġoff ice"
859
+ ]
860
+ }
861
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "clean_up_tokenization_spaces": true,
3
+ "eos_token": "<end>",
4
+ "model_max_length": 1000000000000000019884624838656,
5
+ "pad_token": "<pad>",
6
+ "sos_token": "<start>",
7
+ "tokenizer_class": "PreTrainedTokenizerFast"
8
+ }