add tokenizer
Browse files- special_tokens_map.json +1 -1
- tokenizer.json +81 -0
special_tokens_map.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "mask_token": "<mask>"}
|
|
|
1 |
+
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "mask_token": "<mask>", "additional_special_tokens": ["<P01>", "<P02>", "<P03>", "<P04>", "<P05>", "<P06>", "<P07>", "<P08>", "<P09>"]}
|
tokenizer.json
CHANGED
@@ -2063,6 +2063,87 @@
|
|
2063 |
"rstrip": false,
|
2064 |
"normalized": false,
|
2065 |
"special": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2066 |
}
|
2067 |
],
|
2068 |
"normalizer": {
|
|
|
2063 |
"rstrip": false,
|
2064 |
"normalized": false,
|
2065 |
"special": true
|
2066 |
+
},
|
2067 |
+
{
|
2068 |
+
"id": 30000,
|
2069 |
+
"content": "<P01>",
|
2070 |
+
"single_word": false,
|
2071 |
+
"lstrip": false,
|
2072 |
+
"rstrip": false,
|
2073 |
+
"normalized": false,
|
2074 |
+
"special": true
|
2075 |
+
},
|
2076 |
+
{
|
2077 |
+
"id": 30001,
|
2078 |
+
"content": "<P02>",
|
2079 |
+
"single_word": false,
|
2080 |
+
"lstrip": false,
|
2081 |
+
"rstrip": false,
|
2082 |
+
"normalized": false,
|
2083 |
+
"special": true
|
2084 |
+
},
|
2085 |
+
{
|
2086 |
+
"id": 30002,
|
2087 |
+
"content": "<P03>",
|
2088 |
+
"single_word": false,
|
2089 |
+
"lstrip": false,
|
2090 |
+
"rstrip": false,
|
2091 |
+
"normalized": false,
|
2092 |
+
"special": true
|
2093 |
+
},
|
2094 |
+
{
|
2095 |
+
"id": 30003,
|
2096 |
+
"content": "<P04>",
|
2097 |
+
"single_word": false,
|
2098 |
+
"lstrip": false,
|
2099 |
+
"rstrip": false,
|
2100 |
+
"normalized": false,
|
2101 |
+
"special": true
|
2102 |
+
},
|
2103 |
+
{
|
2104 |
+
"id": 30004,
|
2105 |
+
"content": "<P05>",
|
2106 |
+
"single_word": false,
|
2107 |
+
"lstrip": false,
|
2108 |
+
"rstrip": false,
|
2109 |
+
"normalized": false,
|
2110 |
+
"special": true
|
2111 |
+
},
|
2112 |
+
{
|
2113 |
+
"id": 30005,
|
2114 |
+
"content": "<P06>",
|
2115 |
+
"single_word": false,
|
2116 |
+
"lstrip": false,
|
2117 |
+
"rstrip": false,
|
2118 |
+
"normalized": false,
|
2119 |
+
"special": true
|
2120 |
+
},
|
2121 |
+
{
|
2122 |
+
"id": 30006,
|
2123 |
+
"content": "<P07>",
|
2124 |
+
"single_word": false,
|
2125 |
+
"lstrip": false,
|
2126 |
+
"rstrip": false,
|
2127 |
+
"normalized": false,
|
2128 |
+
"special": true
|
2129 |
+
},
|
2130 |
+
{
|
2131 |
+
"id": 30007,
|
2132 |
+
"content": "<P08>",
|
2133 |
+
"single_word": false,
|
2134 |
+
"lstrip": false,
|
2135 |
+
"rstrip": false,
|
2136 |
+
"normalized": false,
|
2137 |
+
"special": true
|
2138 |
+
},
|
2139 |
+
{
|
2140 |
+
"id": 30008,
|
2141 |
+
"content": "<P09>",
|
2142 |
+
"single_word": false,
|
2143 |
+
"lstrip": false,
|
2144 |
+
"rstrip": false,
|
2145 |
+
"normalized": false,
|
2146 |
+
"special": true
|
2147 |
}
|
2148 |
],
|
2149 |
"normalizer": {
|