onlydj96 committed on
Commit
6c1c00c
1 Parent(s): d0afcd2

add tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "</s>": 32001,
3
+ "<s>": 32000
4
+ }
special_tokens_map.json CHANGED
@@ -7,7 +7,7 @@
7
  "single_word": false
8
  },
9
  "cls_token": {
10
- "content": "<s>",
11
  "lstrip": false,
12
  "normalized": true,
13
  "rstrip": false,
@@ -21,21 +21,21 @@
21
  "single_word": false
22
  },
23
  "mask_token": {
24
- "content": "<mask>",
25
  "lstrip": true,
26
  "normalized": true,
27
  "rstrip": false,
28
  "single_word": false
29
  },
30
  "pad_token": {
31
- "content": "<pad>",
32
  "lstrip": false,
33
  "normalized": true,
34
  "rstrip": false,
35
  "single_word": false
36
  },
37
  "sep_token": {
38
- "content": "</s>",
39
  "lstrip": false,
40
  "normalized": true,
41
  "rstrip": false,
 
7
  "single_word": false
8
  },
9
  "cls_token": {
10
+ "content": "[CLS]",
11
  "lstrip": false,
12
  "normalized": true,
13
  "rstrip": false,
 
21
  "single_word": false
22
  },
23
  "mask_token": {
24
+ "content": "[MASK]",
25
  "lstrip": true,
26
  "normalized": true,
27
  "rstrip": false,
28
  "single_word": false
29
  },
30
  "pad_token": {
31
+ "content": "[PAD]",
32
  "lstrip": false,
33
  "normalized": true,
34
  "rstrip": false,
35
  "single_word": false
36
  },
37
  "sep_token": {
38
+ "content": "[SEP]",
39
  "lstrip": false,
40
  "normalized": true,
41
  "rstrip": false,
tokenizer_config.json CHANGED
@@ -9,7 +9,7 @@
9
  },
10
  "cls_token": {
11
  "__type": "AddedToken",
12
- "content": "<s>",
13
  "lstrip": false,
14
  "normalized": true,
15
  "rstrip": false,
@@ -27,7 +27,7 @@
27
  },
28
  "mask_token": {
29
  "__type": "AddedToken",
30
- "content": "<mask>",
31
  "lstrip": true,
32
  "normalized": true,
33
  "rstrip": false,
@@ -37,7 +37,7 @@
37
  "never_split": null,
38
  "pad_token": {
39
  "__type": "AddedToken",
40
- "content": "<pad>",
41
  "lstrip": false,
42
  "normalized": true,
43
  "rstrip": false,
@@ -45,7 +45,7 @@
45
  },
46
  "sep_token": {
47
  "__type": "AddedToken",
48
- "content": "</s>",
49
  "lstrip": false,
50
  "normalized": true,
51
  "rstrip": false,
 
9
  },
10
  "cls_token": {
11
  "__type": "AddedToken",
12
+ "content": "[CLS]",
13
  "lstrip": false,
14
  "normalized": true,
15
  "rstrip": false,
 
27
  },
28
  "mask_token": {
29
  "__type": "AddedToken",
30
+ "content": "[MASK]",
31
  "lstrip": true,
32
  "normalized": true,
33
  "rstrip": false,
 
37
  "never_split": null,
38
  "pad_token": {
39
  "__type": "AddedToken",
40
+ "content": "[PAD]",
41
  "lstrip": false,
42
  "normalized": true,
43
  "rstrip": false,
 
45
  },
46
  "sep_token": {
47
  "__type": "AddedToken",
48
+ "content": "[SEP]",
49
  "lstrip": false,
50
  "normalized": true,
51
  "rstrip": false,
vocab.txt CHANGED
@@ -1,8 +1,8 @@
1
- <pad>
2
  [UNK]
3
- <s>
4
- </s>
5
- <mask>
6
  0
7
  1
8
  2
 
1
+ [PAD]
2
  [UNK]
3
+ [CLS]
4
+ [SEP]
5
+ [MASK]
6
  0
7
  1
8
  2