File size: 9,415 Bytes
3e4fb5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
{
    "num_threads": 224,
    "split_by_whitespace": true,
    "model_type": "unigram",
    "vocab_size": 250680,
    "character_coverage": 0.9999,
    "byte_fallback": true,
    "split_by_number": true,
    "split_digits": true,
    "normalization_rule_name": "nfkc",
    "max_sentence_length": 4096,
    "shuffle_input_sentence": true,
    "input_sentence_size": 0,
    "train_extremely_large_corpus": true,
    "allow_whitespace_only_pieces": true,
    "required_chars": "",
    "remove_extra_whitespaces": false,
    "user_defined_symbols": [
        "<s>",
        "</s>",
        "<pad>",
        "<eod>",
        "<placeholder_tok_0>",
        "<placeholder_tok_1>",
        "<placeholder_tok_2>",
        "<placeholder_tok_3>",
        "<placeholder_tok_4>",
        "<placeholder_tok_5>",
        "<placeholder_tok_6>",
        "<placeholder_tok_7>",
        "<placeholder_tok_8>",
        "<placeholder_tok_9>",
        "<placeholder_tok_10>",
        "<placeholder_tok_11>",
        "<placeholder_tok_12>",
        "<placeholder_tok_13>",
        "<placeholder_tok_14>",
        "<placeholder_tok_15>",
        "<placeholder_tok_16>",
        "<placeholder_tok_17>",
        "<placeholder_tok_18>",
        "<placeholder_tok_19>",
        "<placeholder_tok_20>",
        "<placeholder_tok_21>",
        "<placeholder_tok_22>",
        "<placeholder_tok_23>",
        "<placeholder_tok_24>",
        "<placeholder_tok_25>",
        "<placeholder_tok_26>",
        "<placeholder_tok_27>",
        "<placeholder_tok_28>",
        "<placeholder_tok_29>",
        "<placeholder_tok_30>",
        "<placeholder_tok_31>",
        "<placeholder_tok_32>",
        "<placeholder_tok_33>",
        "<placeholder_tok_34>",
        "<placeholder_tok_35>",
        "<placeholder_tok_36>",
        "<placeholder_tok_37>",
        "<placeholder_tok_38>",
        "<placeholder_tok_39>",
        "<placeholder_tok_40>",
        "<placeholder_tok_41>",
        "<placeholder_tok_42>",
        "<placeholder_tok_43>",
        "<placeholder_tok_44>",
        "<placeholder_tok_45>",
        "<placeholder_tok_46>",
        "<placeholder_tok_47>",
        "<placeholder_tok_48>",
        "<placeholder_tok_49>",
        "<placeholder_tok_50>",
        "<placeholder_tok_51>",
        "<placeholder_tok_52>",
        "<placeholder_tok_53>",
        "<placeholder_tok_54>",
        "<placeholder_tok_55>",
        "<placeholder_tok_56>",
        "<placeholder_tok_57>",
        "<placeholder_tok_58>",
        "<placeholder_tok_59>",
        "<placeholder_tok_60>",
        "<placeholder_tok_61>",
        "<placeholder_tok_62>",
        "<placeholder_tok_63>",
        "<placeholder_tok_64>",
        "<placeholder_tok_65>",
        "<placeholder_tok_66>",
        "<placeholder_tok_67>",
        "<placeholder_tok_68>",
        "<placeholder_tok_69>",
        "<placeholder_tok_70>",
        "<placeholder_tok_71>",
        "<placeholder_tok_72>",
        "<placeholder_tok_73>",
        "<placeholder_tok_74>",
        "<placeholder_tok_75>",
        "<placeholder_tok_76>",
        "<placeholder_tok_77>",
        "<placeholder_tok_78>",
        "<placeholder_tok_79>",
        "<placeholder_tok_80>",
        "<placeholder_tok_81>",
        "<placeholder_tok_82>",
        "<placeholder_tok_83>",
        "<placeholder_tok_84>",
        "<placeholder_tok_85>",
        "<placeholder_tok_86>",
        "<placeholder_tok_87>",
        "<placeholder_tok_88>",
        "<placeholder_tok_89>",
        "<placeholder_tok_90>",
        "<placeholder_tok_91>",
        "<placeholder_tok_92>",
        "<placeholder_tok_93>",
        "<placeholder_tok_94>",
        "<placeholder_tok_95>",
        "<placeholder_tok_96>",
        "<placeholder_tok_97>",
        "<placeholder_tok_98>",
        "<placeholder_tok_99>",
        "<placeholder_tok_100>",
        "<placeholder_tok_101>",
        "<placeholder_tok_102>",
        "<placeholder_tok_103>",
        "<placeholder_tok_104>",
        "<placeholder_tok_105>",
        "<placeholder_tok_106>",
        "<placeholder_tok_107>",
        "<placeholder_tok_108>",
        "<placeholder_tok_109>",
        "<placeholder_tok_110>",
        "<placeholder_tok_111>",
        "<placeholder_tok_112>",
        "<placeholder_tok_113>",
        "<placeholder_tok_114>",
        "<placeholder_tok_115>",
        "<placeholder_tok_116>",
        "<placeholder_tok_117>",
        "<placeholder_tok_118>",
        "<placeholder_tok_119>",
        "<placeholder_tok_120>",
        "<placeholder_tok_121>",
        "<placeholder_tok_122>",
        "<placeholder_tok_123>",
        "<placeholder_tok_124>",
        "<placeholder_tok_125>",
        "<placeholder_tok_126>",
        "<placeholder_tok_127>",
        "<placeholder_tok_128>",
        "<placeholder_tok_129>",
        "<placeholder_tok_130>",
        "<placeholder_tok_131>",
        "<placeholder_tok_132>",
        "<placeholder_tok_133>",
        "<placeholder_tok_134>",
        "<placeholder_tok_135>",
        "<placeholder_tok_136>",
        "<placeholder_tok_137>",
        "<placeholder_tok_138>",
        "<placeholder_tok_139>",
        "<placeholder_tok_140>",
        "<placeholder_tok_141>",
        "<placeholder_tok_142>",
        "<placeholder_tok_143>",
        "<placeholder_tok_144>",
        "<placeholder_tok_145>",
        "<placeholder_tok_146>",
        "<placeholder_tok_147>",
        "<placeholder_tok_148>",
        "<placeholder_tok_149>",
        "<placeholder_tok_150>",
        "<placeholder_tok_151>",
        "<placeholder_tok_152>",
        "<placeholder_tok_153>",
        "<placeholder_tok_154>",
        "<placeholder_tok_155>",
        "<placeholder_tok_156>",
        "<placeholder_tok_157>",
        "<placeholder_tok_158>",
        "<placeholder_tok_159>",
        "<placeholder_tok_160>",
        "<placeholder_tok_161>",
        "<placeholder_tok_162>",
        "<placeholder_tok_163>",
        "<placeholder_tok_164>",
        "<placeholder_tok_165>",
        "<placeholder_tok_166>",
        "<placeholder_tok_167>",
        "<placeholder_tok_168>",
        "<placeholder_tok_169>",
        "<placeholder_tok_170>",
        "<placeholder_tok_171>",
        "<placeholder_tok_172>",
        "<placeholder_tok_173>",
        "<placeholder_tok_174>",
        "<placeholder_tok_175>",
        "<placeholder_tok_176>",
        "<placeholder_tok_177>",
        "<placeholder_tok_178>",
        "<placeholder_tok_179>",
        "<placeholder_tok_180>",
        "<placeholder_tok_181>",
        "<placeholder_tok_182>",
        "<placeholder_tok_183>",
        "<placeholder_tok_184>",
        "<placeholder_tok_185>",
        "<placeholder_tok_186>",
        "<placeholder_tok_187>",
        "<placeholder_tok_188>",
        "<placeholder_tok_189>",
        "<placeholder_tok_190>",
        "<placeholder_tok_191>",
        "<placeholder_tok_192>",
        "<placeholder_tok_193>",
        "<placeholder_tok_194>",
        "<placeholder_tok_195>",
        "<placeholder_tok_196>",
        "<placeholder_tok_197>",
        "<placeholder_tok_198>",
        "<placeholder_tok_199>",
        "<placeholder_tok_200>",
        "<placeholder_tok_201>",
        "<placeholder_tok_202>",
        "<placeholder_tok_203>",
        "<placeholder_tok_204>",
        "<placeholder_tok_205>",
        "<placeholder_tok_206>",
        "<placeholder_tok_207>",
        "<placeholder_tok_208>",
        "<placeholder_tok_209>",
        "<placeholder_tok_210>",
        "<placeholder_tok_211>",
        "<placeholder_tok_212>",
        "<placeholder_tok_213>",
        "<placeholder_tok_214>",
        "<placeholder_tok_215>",
        "<placeholder_tok_216>",
        "<placeholder_tok_217>",
        "<placeholder_tok_218>",
        "<placeholder_tok_219>",
        "<placeholder_tok_220>",
        "<placeholder_tok_221>",
        "<placeholder_tok_222>",
        "<placeholder_tok_223>",
        "<placeholder_tok_224>",
        "<placeholder_tok_225>",
        "<placeholder_tok_226>",
        "<placeholder_tok_227>",
        "<placeholder_tok_228>",
        "<placeholder_tok_229>",
        "<placeholder_tok_230>",
        "<placeholder_tok_231>",
        "<placeholder_tok_232>",
        "<placeholder_tok_233>",
        "<placeholder_tok_234>",
        "<placeholder_tok_235>",
        "<placeholder_tok_236>",
        "<placeholder_tok_237>",
        "<placeholder_tok_238>",
        "<placeholder_tok_239>",
        "<placeholder_tok_240>",
        "<placeholder_tok_241>",
        "<placeholder_tok_242>",
        "<placeholder_tok_243>",
        "<placeholder_tok_244>",
        "<placeholder_tok_245>",
        "<placeholder_tok_246>",
        "<placeholder_tok_247>",
        "<placeholder_tok_248>",
        "<placeholder_tok_249>",
        "<placeholder_tok_250>",
        "<placeholder_tok_251>",
        "<placeholder_tok_252>",
        "<placeholder_tok_253>",
        "<placeholder_tok_254>",
        "<placeholder_tok_255>"
    ],
    "datasets_dir": "/home/fhgiais/gptx_ablations/bias_analysis/data/tokenizer/temp/",
    "save_dir": "/home/fhgiais/gptx_ablations/bias_analysis/tokenizer/24",
    "text_key": "text",
    "cache_dir": "/home/fhgiais/gptx_ablations/bias_analysis/tokenizer/24/cache",
    "library": "sentencepiece",
    "auto_map": {
        "AutoTokenizer": [
            "gptx_tokenizer.SPTokenizer",
            null
        ]
    },
    "tokenizer_class": "SPTokenizer"
}