File size: 9,415 Bytes
3e4fb5d
{
"num_threads": 224,
"split_by_whitespace": true,
"model_type": "unigram",
"vocab_size": 250680,
"character_coverage": 0.9999,
"byte_fallback": true,
"split_by_number": true,
"split_digits": true,
"normalization_rule_name": "nfkc",
"max_sentence_length": 4096,
"shuffle_input_sentence": true,
"input_sentence_size": 0,
"train_extremely_large_corpus": true,
"allow_whitespace_only_pieces": true,
"required_chars": "",
"remove_extra_whitespaces": false,
"user_defined_symbols": [
"<s>",
"</s>",
"<pad>",
"<eod>",
"<placeholder_tok_0>",
"<placeholder_tok_1>",
"<placeholder_tok_2>",
"<placeholder_tok_3>",
"<placeholder_tok_4>",
"<placeholder_tok_5>",
"<placeholder_tok_6>",
"<placeholder_tok_7>",
"<placeholder_tok_8>",
"<placeholder_tok_9>",
"<placeholder_tok_10>",
"<placeholder_tok_11>",
"<placeholder_tok_12>",
"<placeholder_tok_13>",
"<placeholder_tok_14>",
"<placeholder_tok_15>",
"<placeholder_tok_16>",
"<placeholder_tok_17>",
"<placeholder_tok_18>",
"<placeholder_tok_19>",
"<placeholder_tok_20>",
"<placeholder_tok_21>",
"<placeholder_tok_22>",
"<placeholder_tok_23>",
"<placeholder_tok_24>",
"<placeholder_tok_25>",
"<placeholder_tok_26>",
"<placeholder_tok_27>",
"<placeholder_tok_28>",
"<placeholder_tok_29>",
"<placeholder_tok_30>",
"<placeholder_tok_31>",
"<placeholder_tok_32>",
"<placeholder_tok_33>",
"<placeholder_tok_34>",
"<placeholder_tok_35>",
"<placeholder_tok_36>",
"<placeholder_tok_37>",
"<placeholder_tok_38>",
"<placeholder_tok_39>",
"<placeholder_tok_40>",
"<placeholder_tok_41>",
"<placeholder_tok_42>",
"<placeholder_tok_43>",
"<placeholder_tok_44>",
"<placeholder_tok_45>",
"<placeholder_tok_46>",
"<placeholder_tok_47>",
"<placeholder_tok_48>",
"<placeholder_tok_49>",
"<placeholder_tok_50>",
"<placeholder_tok_51>",
"<placeholder_tok_52>",
"<placeholder_tok_53>",
"<placeholder_tok_54>",
"<placeholder_tok_55>",
"<placeholder_tok_56>",
"<placeholder_tok_57>",
"<placeholder_tok_58>",
"<placeholder_tok_59>",
"<placeholder_tok_60>",
"<placeholder_tok_61>",
"<placeholder_tok_62>",
"<placeholder_tok_63>",
"<placeholder_tok_64>",
"<placeholder_tok_65>",
"<placeholder_tok_66>",
"<placeholder_tok_67>",
"<placeholder_tok_68>",
"<placeholder_tok_69>",
"<placeholder_tok_70>",
"<placeholder_tok_71>",
"<placeholder_tok_72>",
"<placeholder_tok_73>",
"<placeholder_tok_74>",
"<placeholder_tok_75>",
"<placeholder_tok_76>",
"<placeholder_tok_77>",
"<placeholder_tok_78>",
"<placeholder_tok_79>",
"<placeholder_tok_80>",
"<placeholder_tok_81>",
"<placeholder_tok_82>",
"<placeholder_tok_83>",
"<placeholder_tok_84>",
"<placeholder_tok_85>",
"<placeholder_tok_86>",
"<placeholder_tok_87>",
"<placeholder_tok_88>",
"<placeholder_tok_89>",
"<placeholder_tok_90>",
"<placeholder_tok_91>",
"<placeholder_tok_92>",
"<placeholder_tok_93>",
"<placeholder_tok_94>",
"<placeholder_tok_95>",
"<placeholder_tok_96>",
"<placeholder_tok_97>",
"<placeholder_tok_98>",
"<placeholder_tok_99>",
"<placeholder_tok_100>",
"<placeholder_tok_101>",
"<placeholder_tok_102>",
"<placeholder_tok_103>",
"<placeholder_tok_104>",
"<placeholder_tok_105>",
"<placeholder_tok_106>",
"<placeholder_tok_107>",
"<placeholder_tok_108>",
"<placeholder_tok_109>",
"<placeholder_tok_110>",
"<placeholder_tok_111>",
"<placeholder_tok_112>",
"<placeholder_tok_113>",
"<placeholder_tok_114>",
"<placeholder_tok_115>",
"<placeholder_tok_116>",
"<placeholder_tok_117>",
"<placeholder_tok_118>",
"<placeholder_tok_119>",
"<placeholder_tok_120>",
"<placeholder_tok_121>",
"<placeholder_tok_122>",
"<placeholder_tok_123>",
"<placeholder_tok_124>",
"<placeholder_tok_125>",
"<placeholder_tok_126>",
"<placeholder_tok_127>",
"<placeholder_tok_128>",
"<placeholder_tok_129>",
"<placeholder_tok_130>",
"<placeholder_tok_131>",
"<placeholder_tok_132>",
"<placeholder_tok_133>",
"<placeholder_tok_134>",
"<placeholder_tok_135>",
"<placeholder_tok_136>",
"<placeholder_tok_137>",
"<placeholder_tok_138>",
"<placeholder_tok_139>",
"<placeholder_tok_140>",
"<placeholder_tok_141>",
"<placeholder_tok_142>",
"<placeholder_tok_143>",
"<placeholder_tok_144>",
"<placeholder_tok_145>",
"<placeholder_tok_146>",
"<placeholder_tok_147>",
"<placeholder_tok_148>",
"<placeholder_tok_149>",
"<placeholder_tok_150>",
"<placeholder_tok_151>",
"<placeholder_tok_152>",
"<placeholder_tok_153>",
"<placeholder_tok_154>",
"<placeholder_tok_155>",
"<placeholder_tok_156>",
"<placeholder_tok_157>",
"<placeholder_tok_158>",
"<placeholder_tok_159>",
"<placeholder_tok_160>",
"<placeholder_tok_161>",
"<placeholder_tok_162>",
"<placeholder_tok_163>",
"<placeholder_tok_164>",
"<placeholder_tok_165>",
"<placeholder_tok_166>",
"<placeholder_tok_167>",
"<placeholder_tok_168>",
"<placeholder_tok_169>",
"<placeholder_tok_170>",
"<placeholder_tok_171>",
"<placeholder_tok_172>",
"<placeholder_tok_173>",
"<placeholder_tok_174>",
"<placeholder_tok_175>",
"<placeholder_tok_176>",
"<placeholder_tok_177>",
"<placeholder_tok_178>",
"<placeholder_tok_179>",
"<placeholder_tok_180>",
"<placeholder_tok_181>",
"<placeholder_tok_182>",
"<placeholder_tok_183>",
"<placeholder_tok_184>",
"<placeholder_tok_185>",
"<placeholder_tok_186>",
"<placeholder_tok_187>",
"<placeholder_tok_188>",
"<placeholder_tok_189>",
"<placeholder_tok_190>",
"<placeholder_tok_191>",
"<placeholder_tok_192>",
"<placeholder_tok_193>",
"<placeholder_tok_194>",
"<placeholder_tok_195>",
"<placeholder_tok_196>",
"<placeholder_tok_197>",
"<placeholder_tok_198>",
"<placeholder_tok_199>",
"<placeholder_tok_200>",
"<placeholder_tok_201>",
"<placeholder_tok_202>",
"<placeholder_tok_203>",
"<placeholder_tok_204>",
"<placeholder_tok_205>",
"<placeholder_tok_206>",
"<placeholder_tok_207>",
"<placeholder_tok_208>",
"<placeholder_tok_209>",
"<placeholder_tok_210>",
"<placeholder_tok_211>",
"<placeholder_tok_212>",
"<placeholder_tok_213>",
"<placeholder_tok_214>",
"<placeholder_tok_215>",
"<placeholder_tok_216>",
"<placeholder_tok_217>",
"<placeholder_tok_218>",
"<placeholder_tok_219>",
"<placeholder_tok_220>",
"<placeholder_tok_221>",
"<placeholder_tok_222>",
"<placeholder_tok_223>",
"<placeholder_tok_224>",
"<placeholder_tok_225>",
"<placeholder_tok_226>",
"<placeholder_tok_227>",
"<placeholder_tok_228>",
"<placeholder_tok_229>",
"<placeholder_tok_230>",
"<placeholder_tok_231>",
"<placeholder_tok_232>",
"<placeholder_tok_233>",
"<placeholder_tok_234>",
"<placeholder_tok_235>",
"<placeholder_tok_236>",
"<placeholder_tok_237>",
"<placeholder_tok_238>",
"<placeholder_tok_239>",
"<placeholder_tok_240>",
"<placeholder_tok_241>",
"<placeholder_tok_242>",
"<placeholder_tok_243>",
"<placeholder_tok_244>",
"<placeholder_tok_245>",
"<placeholder_tok_246>",
"<placeholder_tok_247>",
"<placeholder_tok_248>",
"<placeholder_tok_249>",
"<placeholder_tok_250>",
"<placeholder_tok_251>",
"<placeholder_tok_252>",
"<placeholder_tok_253>",
"<placeholder_tok_254>",
"<placeholder_tok_255>"
],
"datasets_dir": "/home/fhgiais/gptx_ablations/bias_analysis/data/tokenizer/temp/",
"save_dir": "/home/fhgiais/gptx_ablations/bias_analysis/tokenizer/24",
"text_key": "text",
"cache_dir": "/home/fhgiais/gptx_ablations/bias_analysis/tokenizer/24/cache",
"library": "sentencepiece",
"auto_map": {
"AutoTokenizer": [
"gptx_tokenizer.SPTokenizer",
null
]
},
"tokenizer_class": "SPTokenizer"
}