yuewang-sf commited on
Commit
0d572be
1 Parent(s): 1b8fc1c

Upload tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "single_word": false,
5
+ "lstrip": false,
6
+ "rstrip": false,
7
+ "normalized": true
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "single_word": false,
12
+ "lstrip": false,
13
+ "rstrip": false,
14
+ "normalized": true
15
+ },
16
+ "unk_token": {
17
+ "content": "<unk>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": true
22
+ },
23
+ "sep_token": {
24
+ "content": "</s>",
25
+ "single_word": false,
26
+ "lstrip": false,
27
+ "rstrip": false,
28
+ "normalized": true
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "single_word": false,
33
+ "lstrip": false,
34
+ "rstrip": false,
35
+ "normalized": true
36
+ },
37
+ "cls_token": {
38
+ "content": "<s>",
39
+ "single_word": false,
40
+ "lstrip": false,
41
+ "rstrip": false,
42
+ "normalized": true
43
+ },
44
+ "mask_token": { "content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
45
+ "additional_special_tokens": [
46
+ { "content":"<extra_id_99>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
47
+ { "content":"<extra_id_98>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
48
+ { "content":"<extra_id_97>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
49
+ { "content":"<extra_id_96>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
50
+ { "content":"<extra_id_95>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
51
+ { "content":"<extra_id_94>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
52
+ { "content":"<extra_id_93>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
53
+ { "content":"<extra_id_92>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
54
+ { "content":"<extra_id_91>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
55
+ { "content":"<extra_id_90>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
56
+ { "content":"<extra_id_89>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
57
+ { "content":"<extra_id_88>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
58
+ { "content":"<extra_id_87>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
59
+ { "content":"<extra_id_86>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
60
+ { "content":"<extra_id_85>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
61
+ { "content":"<extra_id_84>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
62
+ { "content":"<extra_id_83>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
63
+ { "content":"<extra_id_82>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
64
+ { "content":"<extra_id_81>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
65
+ { "content":"<extra_id_80>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
66
+ { "content":"<extra_id_79>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
67
+ { "content":"<extra_id_78>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
68
+ { "content":"<extra_id_77>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
69
+ { "content":"<extra_id_76>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
70
+ { "content":"<extra_id_75>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
71
+ { "content":"<extra_id_74>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
72
+ { "content":"<extra_id_73>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
73
+ { "content":"<extra_id_72>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
74
+ { "content":"<extra_id_71>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
75
+ { "content":"<extra_id_70>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
76
+ { "content":"<extra_id_69>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
77
+ { "content":"<extra_id_68>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
78
+ { "content":"<extra_id_67>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
79
+ { "content":"<extra_id_66>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
80
+ { "content":"<extra_id_65>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
81
+ { "content":"<extra_id_64>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
82
+ { "content":"<extra_id_63>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
83
+ { "content":"<extra_id_62>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
84
+ { "content":"<extra_id_61>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
85
+ { "content":"<extra_id_60>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
86
+ { "content":"<extra_id_59>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
87
+ { "content":"<extra_id_58>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
88
+ { "content":"<extra_id_57>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
89
+ { "content":"<extra_id_56>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
90
+ { "content":"<extra_id_55>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
91
+ { "content":"<extra_id_54>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
92
+ { "content":"<extra_id_53>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
93
+ { "content":"<extra_id_52>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
94
+ { "content":"<extra_id_51>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
95
+ { "content":"<extra_id_50>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
96
+ { "content":"<extra_id_49>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
97
+ { "content":"<extra_id_48>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
98
+ { "content":"<extra_id_47>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
99
+ { "content":"<extra_id_46>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
100
+ { "content":"<extra_id_45>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
101
+ { "content":"<extra_id_44>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
102
+ { "content":"<extra_id_43>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
103
+ { "content":"<extra_id_42>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
104
+ { "content":"<extra_id_41>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
105
+ { "content":"<extra_id_40>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
106
+ { "content":"<extra_id_39>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
107
+ { "content":"<extra_id_38>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
108
+ { "content":"<extra_id_37>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
109
+ { "content":"<extra_id_36>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
110
+ { "content":"<extra_id_35>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
111
+ { "content":"<extra_id_34>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
112
+ { "content":"<extra_id_33>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
113
+ { "content":"<extra_id_32>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
114
+ { "content":"<extra_id_31>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
115
+ { "content":"<extra_id_30>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
116
+ { "content":"<extra_id_29>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
117
+ { "content":"<extra_id_28>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
118
+ { "content":"<extra_id_27>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
119
+ { "content":"<extra_id_26>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
120
+ { "content":"<extra_id_25>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
121
+ { "content":"<extra_id_24>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
122
+ { "content":"<extra_id_23>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
123
+ { "content":"<extra_id_22>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
124
+ { "content":"<extra_id_21>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
125
+ { "content":"<extra_id_20>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
126
+ { "content":"<extra_id_19>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
127
+ { "content":"<extra_id_18>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
128
+ { "content":"<extra_id_17>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
129
+ { "content":"<extra_id_16>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
130
+ { "content":"<extra_id_15>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
131
+ { "content":"<extra_id_14>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
132
+ { "content":"<extra_id_13>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
133
+ { "content":"<extra_id_12>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
134
+ { "content":"<extra_id_11>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
135
+ { "content":"<extra_id_10>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
136
+ { "content":"<extra_id_9>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
137
+ { "content":"<extra_id_8>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
138
+ { "content":"<extra_id_7>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
139
+ { "content":"<extra_id_6>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
140
+ { "content":"<extra_id_5>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
141
+ { "content":"<extra_id_4>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
142
+ { "content":"<extra_id_3>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
143
+ { "content":"<extra_id_2>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
144
+ { "content":"<extra_id_1>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true },
145
+ { "content":"<extra_id_0>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true }
146
+ ]
147
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "errors": "replace",
3
+ "unk_token": {
4
+ "content": "<unk>",
5
+ "single_word": false,
6
+ "lstrip": false,
7
+ "rstrip": false,
8
+ "normalized": true,
9
+ "__type": "AddedToken"
10
+ },
11
+ "bos_token": {
12
+ "content": "<s>",
13
+ "single_word": false,
14
+ "lstrip": false,
15
+ "rstrip": false,
16
+ "normalized": true,
17
+ "__type": "AddedToken"
18
+ },
19
+ "eos_token": {
20
+ "content": "</s>",
21
+ "single_word": false,
22
+ "lstrip": false,
23
+ "rstrip": false,
24
+ "normalized": true,
25
+ "__type": "AddedToken"
26
+ },
27
+ "add_prefix_space": false,
28
+ "sep_token": {
29
+ "content": "</s>",
30
+ "single_word": false,
31
+ "lstrip": false,
32
+ "rstrip": false,
33
+ "normalized": true,
34
+ "__type": "AddedToken"
35
+ },
36
+ "cls_token": {
37
+ "content": "<s>",
38
+ "single_word": false,
39
+ "lstrip": false,
40
+ "rstrip": false,
41
+ "normalized": true,
42
+ "__type": "AddedToken"
43
+ },
44
+ "pad_token": {
45
+ "content": "<pad>",
46
+ "single_word": false,
47
+ "lstrip": false,
48
+ "rstrip": false,
49
+ "normalized": true,
50
+ "__type": "AddedToken"
51
+ },
52
+ "mask_token": {
53
+ "content": "<mask>",
54
+ "single_word": false,
55
+ "lstrip": true,
56
+ "rstrip": false,
57
+ "normalized": true,
58
+ "__type": "AddedToken"
59
+ },
60
+ "model_max_length": 512,
61
+ "tokenizer_class": "RobertaTokenizer"
62
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff