RajSang committed on
Commit
5539dbb
1 Parent(s): cd874d7

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +2 -14
  2. tokenizer.json +103 -10
  3. tokenizer_config.json +2 -5
special_tokens_map.json CHANGED
@@ -1,19 +1,7 @@
1
  {
2
  "additional_special_tokens": [
3
- {
4
- "content": "[V0]",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false
9
- },
10
- {
11
- "content": "[V1]",
12
- "lstrip": false,
13
- "normalized": false,
14
- "rstrip": false,
15
- "single_word": false
16
- }
17
  ],
18
  "cls_token": "[CLS]",
19
  "mask_token": "[MASK]",
 
1
  {
2
  "additional_special_tokens": [
3
+ "[V0]",
4
+ "[V1]"
 
 
 
 
 
 
 
 
 
 
 
 
5
  ],
6
  "cls_token": "[CLS]",
7
  "mask_token": "[MASK]",
tokenizer.json CHANGED
@@ -82,7 +82,49 @@
82
  "single": [
83
  {
84
  "SpecialToken": {
85
- "id": "[CLS]",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  "type_id": 0
87
  }
88
  },
@@ -102,7 +144,49 @@
102
  "pair": [
103
  {
104
  "SpecialToken": {
105
- "id": "[CLS]",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  "type_id": 0
107
  }
108
  },
@@ -132,22 +216,31 @@
132
  }
133
  ],
134
  "special_tokens": {
135
- "[CLS]": {
136
- "id": "[CLS]",
 
 
 
 
 
 
 
 
 
137
  "ids": [
138
- 101
139
  ],
140
  "tokens": [
141
- "[CLS]"
142
  ]
143
  },
144
- "[SEP]": {
145
- "id": "[SEP]",
146
  "ids": [
147
- 102
148
  ],
149
  "tokens": [
150
- "[SEP]"
151
  ]
152
  }
153
  }
 
82
  "single": [
83
  {
84
  "SpecialToken": {
85
+ "id": "[V0]",
86
+ "type_id": 0
87
+ }
88
+ },
89
+ {
90
+ "SpecialToken": {
91
+ "id": "[V1]",
92
+ "type_id": 0
93
+ }
94
+ },
95
+ {
96
+ "SpecialToken": {
97
+ "id": "[V1]",
98
+ "type_id": 0
99
+ }
100
+ },
101
+ {
102
+ "SpecialToken": {
103
+ "id": "[V1]",
104
+ "type_id": 0
105
+ }
106
+ },
107
+ {
108
+ "SpecialToken": {
109
+ "id": "[V1]",
110
+ "type_id": 0
111
+ }
112
+ },
113
+ {
114
+ "SpecialToken": {
115
+ "id": "[V1]",
116
+ "type_id": 0
117
+ }
118
+ },
119
+ {
120
+ "SpecialToken": {
121
+ "id": "[V1]",
122
+ "type_id": 0
123
+ }
124
+ },
125
+ {
126
+ "SpecialToken": {
127
+ "id": "[V1]",
128
  "type_id": 0
129
  }
130
  },
 
144
  "pair": [
145
  {
146
  "SpecialToken": {
147
+ "id": "[V0]",
148
+ "type_id": 0
149
+ }
150
+ },
151
+ {
152
+ "SpecialToken": {
153
+ "id": "[V1]",
154
+ "type_id": 0
155
+ }
156
+ },
157
+ {
158
+ "SpecialToken": {
159
+ "id": "[V1]",
160
+ "type_id": 0
161
+ }
162
+ },
163
+ {
164
+ "SpecialToken": {
165
+ "id": "[V1]",
166
+ "type_id": 0
167
+ }
168
+ },
169
+ {
170
+ "SpecialToken": {
171
+ "id": "[V1]",
172
+ "type_id": 0
173
+ }
174
+ },
175
+ {
176
+ "SpecialToken": {
177
+ "id": "[V1]",
178
+ "type_id": 0
179
+ }
180
+ },
181
+ {
182
+ "SpecialToken": {
183
+ "id": "[V1]",
184
+ "type_id": 0
185
+ }
186
+ },
187
+ {
188
+ "SpecialToken": {
189
+ "id": "[V1]",
190
  "type_id": 0
191
  }
192
  },
 
216
  }
217
  ],
218
  "special_tokens": {
219
+ "[SEP]": {
220
+ "id": "[SEP]",
221
+ "ids": [
222
+ 102
223
+ ],
224
+ "tokens": [
225
+ "[SEP]"
226
+ ]
227
+ },
228
+ "[V0]": {
229
+ "id": "[V0]",
230
  "ids": [
231
+ 30522
232
  ],
233
  "tokens": [
234
+ "[V0]"
235
  ]
236
  },
237
+ "[V1]": {
238
+ "id": "[V1]",
239
  "ids": [
240
+ 30523
241
  ],
242
  "tokens": [
243
+ "[V1]"
244
  ]
245
  }
246
  }
tokenizer_config.json CHANGED
@@ -63,13 +63,10 @@
63
  ],
64
  "clean_up_tokenization_spaces": true,
65
  "cls_token": "[CLS]",
66
- "do_lower_case": true,
67
  "mask_token": "[MASK]",
68
- "model_max_length": 512,
69
  "pad_token": "[PAD]",
70
  "sep_token": "[SEP]",
71
- "strip_accents": null,
72
- "tokenize_chinese_chars": true,
73
- "tokenizer_class": "BertTokenizer",
74
  "unk_token": "[UNK]"
75
  }
 
63
  ],
64
  "clean_up_tokenization_spaces": true,
65
  "cls_token": "[CLS]",
 
66
  "mask_token": "[MASK]",
67
+ "model_max_length": 1000000000000000019884624838656,
68
  "pad_token": "[PAD]",
69
  "sep_token": "[SEP]",
70
+ "tokenizer_class": "PreTrainedTokenizerFast",
 
 
71
  "unk_token": "[UNK]"
72
  }