RajSang committed
Commit: cd874d7
1 Parent(s): 1ae6205

Upload tokenizer

Files changed (3):
  1. special_tokens_map.json +14 -2
  2. tokenizer.json +10 -31
  3. tokenizer_config.json +5 -2
special_tokens_map.json CHANGED
@@ -1,7 +1,19 @@
 {
   "additional_special_tokens": [
-    "[V0]",
-    "[V1]"
+    {
+      "content": "[V0]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[V1]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
   ],
   "cls_token": "[CLS]",
   "mask_token": "[MASK]",
tokenizer.json CHANGED
@@ -82,13 +82,7 @@
     "single": [
       {
         "SpecialToken": {
-          "id": "[V0]",
-          "type_id": 0
-        }
-      },
-      {
-        "SpecialToken": {
-          "id": "[V1]",
+          "id": "[CLS]",
           "type_id": 0
         }
       },
@@ -108,13 +102,7 @@
     "pair": [
       {
         "SpecialToken": {
-          "id": "[V0]",
-          "type_id": 0
-        }
-      },
-      {
-        "SpecialToken": {
-          "id": "[V1]",
+          "id": "[CLS]",
           "type_id": 0
         }
       },
@@ -144,31 +132,22 @@
       }
     ],
     "special_tokens": {
-      "[SEP]": {
-        "id": "[SEP]",
+      "[CLS]": {
+        "id": "[CLS]",
         "ids": [
-          102
+          101
         ],
         "tokens": [
-          "[SEP]"
+          "[CLS]"
         ]
       },
-      "[V0]": {
-        "id": "[V0]",
-        "ids": [
-          30522
-        ],
-        "tokens": [
-          "[V0]"
-        ]
-      },
-      "[V1]": {
-        "id": "[V1]",
+      "[SEP]": {
+        "id": "[SEP]",
         "ids": [
-          30523
+          102
         ],
         "tokens": [
-          "[V1]"
+          "[SEP]"
         ]
       }
     }
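The post-processor template previously opened every sequence with the custom [V0]/[V1] markers; the fix restores the standard [CLS] prefix alongside the existing [SEP] suffix. A quick sanity check, assuming the updated files are saved locally (path is an illustrative assumption; substitute the actual Hub repo id):

from transformers import AutoTokenizer

# Path is an illustrative assumption.
tokenizer = AutoTokenizer.from_pretrained("./tokenizer")

# With the corrected TemplateProcessing, a single sequence is wrapped as
# [CLS] ... [SEP] (ids 101 and 102 in the standard BERT vocab) instead of
# opening with [V0][V1].
ids = tokenizer("hello world")["input_ids"]
print(ids[0], ids[-1])                                     # 101 102
print(tokenizer.convert_ids_to_tokens([ids[0], ids[-1]]))  # ['[CLS]', '[SEP]']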
tokenizer_config.json CHANGED
@@ -63,10 +63,13 @@
   ],
   "clean_up_tokenization_spaces": true,
   "cls_token": "[CLS]",
+  "do_lower_case": true,
   "mask_token": "[MASK]",
-  "model_max_length": 1000000000000000019884624838656,
+  "model_max_length": 512,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
-  "tokenizer_class": "PreTrainedTokenizerFast",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
   "unk_token": "[UNK]"
 }
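The old model_max_length, 1000000000000000019884624838656, is int(1e30), the VERY_LARGE_INTEGER sentinel that transformers writes when a tokenizer has no known length limit; the commit pins the usual BERT limit of 512 instead, and switching tokenizer_class to BertTokenizer lets the new do_lower_case, strip_accents and tokenize_chinese_chars options take effect. A short check, under the same local-path assumption as above:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./tokenizer")  # path is illustrative

# The sentinel is gone: the usual BERT limit is pinned instead.
assert tokenizer.model_max_length == 512

# With a real limit set, truncation works without an explicit max_length.
enc = tokenizer("word " * 1000, truncation=True)
print(len(enc["input_ids"]))  # at most 512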