RajSang committed on
Commit
1ae6205
1 Parent(s): b28cb1a

Upload tokenizer

Files changed (3)
  1. special_tokens_map.json +2 -14
  2. tokenizer.json +31 -10
  3. tokenizer_config.json +2 -5
special_tokens_map.json CHANGED
@@ -1,19 +1,7 @@
 {
   "additional_special_tokens": [
-    {
-      "content": "[V0]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false
-    },
-    {
-      "content": "[V1]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false
-    }
+    "[V0]",
+    "[V1]"
   ],
   "cls_token": "[CLS]",
   "mask_token": "[MASK]",
tokenizer.json CHANGED
@@ -82,7 +82,13 @@
     "single": [
       {
         "SpecialToken": {
-          "id": "[CLS]",
+          "id": "[V0]",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "[V1]",
           "type_id": 0
         }
       },
@@ -102,7 +108,13 @@
     "pair": [
       {
         "SpecialToken": {
-          "id": "[CLS]",
+          "id": "[V0]",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "[V1]",
           "type_id": 0
         }
       },
@@ -132,22 +144,31 @@
       }
     ],
     "special_tokens": {
-      "[CLS]": {
-        "id": "[CLS]",
+      "[SEP]": {
+        "id": "[SEP]",
         "ids": [
-          101
+          102
         ],
         "tokens": [
-          "[CLS]"
+          "[SEP]"
         ]
       },
-      "[SEP]": {
-        "id": "[SEP]",
+      "[V0]": {
+        "id": "[V0]",
         "ids": [
-          102
+          30522
         ],
         "tokens": [
-          "[SEP]"
+          "[V0]"
+        ]
+      },
+      "[V1]": {
+        "id": "[V1]",
+        "ids": [
+          30523
+        ],
+        "tokens": [
+          "[V1]"
         ]
       }
     }
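
Taken together, the post-processor edits above put [V0] and [V1] where [CLS] used to sit and register their vocabulary ids (30522 and 30523) alongside [SEP] (102). A sketch of an equivalent setup built with the tokenizers library is shown below; the full single/pair templates of the uploaded tokenizer.json are only partially visible in this diff, so the exact placement of the tokens and of [SEP] is an assumption.

# Sketch only: a TemplateProcessing post-processor that prefixes sequences with
# [V0] and [V1]. The template layouts below are assumptions; only the special
# token ids are taken from the diff.
from tokenizers.processors import TemplateProcessing

post_processor = TemplateProcessing(
    single="[V0] [V1] $A [SEP]",
    pair="[V0] [V1] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[SEP]", 102), ("[V0]", 30522), ("[V1]", 30523)],
)
# A tokenizers.Tokenizer would pick this up via: tokenizer.post_processor = post_processor
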
tokenizer_config.json CHANGED
@@ -63,13 +63,10 @@
   ],
   "clean_up_tokenization_spaces": true,
   "cls_token": "[CLS]",
-  "do_lower_case": true,
   "mask_token": "[MASK]",
-  "model_max_length": 512,
+  "model_max_length": 1000000000000000019884624838656,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
-  "strip_accents": null,
-  "tokenize_chinese_chars": true,
-  "tokenizer_class": "BertTokenizer",
+  "tokenizer_class": "PreTrainedTokenizerFast",
   "unk_token": "[UNK]"
 }
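
The config now declares the generic PreTrainedTokenizerFast class and drops the BertTokenizer-specific options; the very large model_max_length is int(1e30), the sentinel transformers writes when no explicit length limit is configured. A quick check, assuming the files from this commit are saved locally (placeholder path):

# Sketch only: load the uploaded tokenizer and inspect the fields changed here.
# "./tokenizer" is a placeholder for a local copy of this repository.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./tokenizer")
print(type(tok).__name__)    # PreTrainedTokenizerFast
print(tok.model_max_length)  # 1000000000000000019884624838656 == int(1e30)
print(tok.convert_tokens_to_ids(["[V0]", "[V1]"]))  # [30522, 30523] per tokenizer.json
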