---
# Configuration for the `bpe_tokenizer` preprocessor.
#
# NOTE(review): this file was previously collapsed onto a single line, which
# is not parseable YAML (a plain scalar may not contain ": "). The block
# structure below is reconstructed: `name` and `config_type` occur twice, so
# the second pair must live inside `train_config`; the keys that follow them
# are assumed to belong to `train_config` as well - confirm against the
# consumer's schema.

name: bpe_tokenizer
config_type: preprocessor

# Truncation and padding strategies selected for this preprocessor.
truncation_strategy: no_truncation
padding_strategy: no_padding

# Explicit empty strings (not null) for the sub-word prefix and word suffix.
continuing_subword_prefix: ''
end_of_word_suffix: ''
fuse_unk: false

# Nested configuration; repeats the same name/config_type as the parent.
# Presumably consumed when training the tokenizer - verify with the loader.
train_config:
  name: bpe_tokenizer
  config_type: preprocessor
  vocab_size: 30000
  min_frequency: 2
  limit_alphabet: 1000
  show_progress: true