{
  "num_threads": 224,
  "split_by_whitespace": true,
  "model_type": "unigram",
  "vocab_size": 250680,
  "character_coverage": 0.9999,
  "byte_fallback": true,
  "split_by_number": true,
  "split_digits": true,
  "normalization_rule_name": "nfkc",
  "max_sentence_length": 4096,
  "shuffle_input_sentence": true,
  "input_sentence_size": 0,
  "train_extremely_large_corpus": true,
  "allow_whitespace_only_pieces": true,
  "required_chars": "",
  "remove_extra_whitespaces": false,
  "user_defined_symbols": [
    "<s>",
    "</s>",
    "<pad>",
    "<eod>",
    "<placeholder_tok_0>",
    "<placeholder_tok_1>",
    "<placeholder_tok_2>",
    "<placeholder_tok_3>",
    "<placeholder_tok_4>",
    "<placeholder_tok_5>",
    "<placeholder_tok_6>",
    "<placeholder_tok_7>",
    "<placeholder_tok_8>",
    "<placeholder_tok_9>",
    "<placeholder_tok_10>",
    "<placeholder_tok_11>",
    "<placeholder_tok_12>",
    "<placeholder_tok_13>",
    "<placeholder_tok_14>",
    "<placeholder_tok_15>",
    "<placeholder_tok_16>",
    "<placeholder_tok_17>",
    "<placeholder_tok_18>",
    "<placeholder_tok_19>",
    "<placeholder_tok_20>",
    "<placeholder_tok_21>",
    "<placeholder_tok_22>",
    "<placeholder_tok_23>",
    "<placeholder_tok_24>",
    "<placeholder_tok_25>",
    "<placeholder_tok_26>",
    "<placeholder_tok_27>",
    "<placeholder_tok_28>",
    "<placeholder_tok_29>",
    "<placeholder_tok_30>",
    "<placeholder_tok_31>",
    "<placeholder_tok_32>",
    "<placeholder_tok_33>",
    "<placeholder_tok_34>",
    "<placeholder_tok_35>",
    "<placeholder_tok_36>",
    "<placeholder_tok_37>",
    "<placeholder_tok_38>",
    "<placeholder_tok_39>",
    "<placeholder_tok_40>",
    "<placeholder_tok_41>",
    "<placeholder_tok_42>",
    "<placeholder_tok_43>",
    "<placeholder_tok_44>",
    "<placeholder_tok_45>",
    "<placeholder_tok_46>",
    "<placeholder_tok_47>",
    "<placeholder_tok_48>",
    "<placeholder_tok_49>",
    "<placeholder_tok_50>",
    "<placeholder_tok_51>",
    "<placeholder_tok_52>",
    "<placeholder_tok_53>",
    "<placeholder_tok_54>",
    "<placeholder_tok_55>",
    "<placeholder_tok_56>",
    "<placeholder_tok_57>",
    "<placeholder_tok_58>",
    "<placeholder_tok_59>",
    "<placeholder_tok_60>",
    "<placeholder_tok_61>",
    "<placeholder_tok_62>",
    "<placeholder_tok_63>",
    "<placeholder_tok_64>",
    "<placeholder_tok_65>",
    "<placeholder_tok_66>",
    "<placeholder_tok_67>",
    "<placeholder_tok_68>",
    "<placeholder_tok_69>",
    "<placeholder_tok_70>",
    "<placeholder_tok_71>",
    "<placeholder_tok_72>",
    "<placeholder_tok_73>",
    "<placeholder_tok_74>",
    "<placeholder_tok_75>",
    "<placeholder_tok_76>",
    "<placeholder_tok_77>",
    "<placeholder_tok_78>",
    "<placeholder_tok_79>",
    "<placeholder_tok_80>",
    "<placeholder_tok_81>",
    "<placeholder_tok_82>",
    "<placeholder_tok_83>",
    "<placeholder_tok_84>",
    "<placeholder_tok_85>",
    "<placeholder_tok_86>",
    "<placeholder_tok_87>",
    "<placeholder_tok_88>",
    "<placeholder_tok_89>",
    "<placeholder_tok_90>",
    "<placeholder_tok_91>",
    "<placeholder_tok_92>",
    "<placeholder_tok_93>",
    "<placeholder_tok_94>",
    "<placeholder_tok_95>",
    "<placeholder_tok_96>",
    "<placeholder_tok_97>",
    "<placeholder_tok_98>",
    "<placeholder_tok_99>",
    "<placeholder_tok_100>",
    "<placeholder_tok_101>",
    "<placeholder_tok_102>",
    "<placeholder_tok_103>",
    "<placeholder_tok_104>",
    "<placeholder_tok_105>",
    "<placeholder_tok_106>",
    "<placeholder_tok_107>",
    "<placeholder_tok_108>",
    "<placeholder_tok_109>",
    "<placeholder_tok_110>",
    "<placeholder_tok_111>",
    "<placeholder_tok_112>",
    "<placeholder_tok_113>",
    "<placeholder_tok_114>",
    "<placeholder_tok_115>",
    "<placeholder_tok_116>",
    "<placeholder_tok_117>",
    "<placeholder_tok_118>",
    "<placeholder_tok_119>",
    "<placeholder_tok_120>",
    "<placeholder_tok_121>",
    "<placeholder_tok_122>",
    "<placeholder_tok_123>",
    "<placeholder_tok_124>",
    "<placeholder_tok_125>",
    "<placeholder_tok_126>",
    "<placeholder_tok_127>",
    "<placeholder_tok_128>",
    "<placeholder_tok_129>",
    "<placeholder_tok_130>",
    "<placeholder_tok_131>",
    "<placeholder_tok_132>",
    "<placeholder_tok_133>",
    "<placeholder_tok_134>",
    "<placeholder_tok_135>",
    "<placeholder_tok_136>",
    "<placeholder_tok_137>",
    "<placeholder_tok_138>",
    "<placeholder_tok_139>",
    "<placeholder_tok_140>",
    "<placeholder_tok_141>",
    "<placeholder_tok_142>",
    "<placeholder_tok_143>",
    "<placeholder_tok_144>",
    "<placeholder_tok_145>",
    "<placeholder_tok_146>",
    "<placeholder_tok_147>",
    "<placeholder_tok_148>",
    "<placeholder_tok_149>",
    "<placeholder_tok_150>",
    "<placeholder_tok_151>",
    "<placeholder_tok_152>",
    "<placeholder_tok_153>",
    "<placeholder_tok_154>",
    "<placeholder_tok_155>",
    "<placeholder_tok_156>",
    "<placeholder_tok_157>",
    "<placeholder_tok_158>",
    "<placeholder_tok_159>",
    "<placeholder_tok_160>",
    "<placeholder_tok_161>",
    "<placeholder_tok_162>",
    "<placeholder_tok_163>",
    "<placeholder_tok_164>",
    "<placeholder_tok_165>",
    "<placeholder_tok_166>",
    "<placeholder_tok_167>",
    "<placeholder_tok_168>",
    "<placeholder_tok_169>",
    "<placeholder_tok_170>",
    "<placeholder_tok_171>",
    "<placeholder_tok_172>",
    "<placeholder_tok_173>",
    "<placeholder_tok_174>",
    "<placeholder_tok_175>",
    "<placeholder_tok_176>",
    "<placeholder_tok_177>",
    "<placeholder_tok_178>",
    "<placeholder_tok_179>",
    "<placeholder_tok_180>",
    "<placeholder_tok_181>",
    "<placeholder_tok_182>",
    "<placeholder_tok_183>",
    "<placeholder_tok_184>",
    "<placeholder_tok_185>",
    "<placeholder_tok_186>",
    "<placeholder_tok_187>",
    "<placeholder_tok_188>",
    "<placeholder_tok_189>",
    "<placeholder_tok_190>",
    "<placeholder_tok_191>",
    "<placeholder_tok_192>",
    "<placeholder_tok_193>",
    "<placeholder_tok_194>",
    "<placeholder_tok_195>",
    "<placeholder_tok_196>",
    "<placeholder_tok_197>",
    "<placeholder_tok_198>",
    "<placeholder_tok_199>",
    "<placeholder_tok_200>",
    "<placeholder_tok_201>",
    "<placeholder_tok_202>",
    "<placeholder_tok_203>",
    "<placeholder_tok_204>",
    "<placeholder_tok_205>",
    "<placeholder_tok_206>",
    "<placeholder_tok_207>",
    "<placeholder_tok_208>",
    "<placeholder_tok_209>",
    "<placeholder_tok_210>",
    "<placeholder_tok_211>",
    "<placeholder_tok_212>",
    "<placeholder_tok_213>",
    "<placeholder_tok_214>",
    "<placeholder_tok_215>",
    "<placeholder_tok_216>",
    "<placeholder_tok_217>",
    "<placeholder_tok_218>",
    "<placeholder_tok_219>",
    "<placeholder_tok_220>",
    "<placeholder_tok_221>",
    "<placeholder_tok_222>",
    "<placeholder_tok_223>",
    "<placeholder_tok_224>",
    "<placeholder_tok_225>",
    "<placeholder_tok_226>",
    "<placeholder_tok_227>",
    "<placeholder_tok_228>",
    "<placeholder_tok_229>",
    "<placeholder_tok_230>",
    "<placeholder_tok_231>",
    "<placeholder_tok_232>",
    "<placeholder_tok_233>",
    "<placeholder_tok_234>",
    "<placeholder_tok_235>",
    "<placeholder_tok_236>",
    "<placeholder_tok_237>",
    "<placeholder_tok_238>",
    "<placeholder_tok_239>",
    "<placeholder_tok_240>",
    "<placeholder_tok_241>",
    "<placeholder_tok_242>",
    "<placeholder_tok_243>",
    "<placeholder_tok_244>",
    "<placeholder_tok_245>",
    "<placeholder_tok_246>",
    "<placeholder_tok_247>",
    "<placeholder_tok_248>",
    "<placeholder_tok_249>",
    "<placeholder_tok_250>",
    "<placeholder_tok_251>",
    "<placeholder_tok_252>",
    "<placeholder_tok_253>",
    "<placeholder_tok_254>",
    "<placeholder_tok_255>"
  ],
  "datasets_dir": "/home/fhgiais/gptx_ablations/bias_analysis/data/tokenizer/temp/",
  "save_dir": "/home/fhgiais/gptx_ablations/bias_analysis/tokenizer/24",
  "text_key": "text",
  "cache_dir": "/home/fhgiais/gptx_ablations/bias_analysis/tokenizer/24/cache",
  "library": "sentencepiece",
  "auto_map": {
    "AutoTokenizer": [
      "gptx_tokenizer.SPTokenizer",
      null
    ]
  },
  "tokenizer_class": "SPTokenizer"
}