Upload preprocessor with huggingface_hub

Files changed (2)

preprocessor/tokenizer.json (changed)

The diff for this file is too large to render. See raw diff

preprocessor/tokenizer_config.yaml (changed)

@@ -1,19 +1,14 @@
-name: wordpiece_tokenizer
 config_type: preprocessor
-max_length: 512
-truncation_strategy: longest_first
-truncation_direction: right
-stride: 0
-padding_strategy: longest
-padding_direction: right
-pad_to_multiple_of: 0
 pad_token_id: 0
-pad_token: '[PAD]'
-pad_token_type_id: 0
-unk_token: '[UNK]'
-wordpieces_prefix: '##'
 train_config:
-  name: wordpiece_tokenizer
   config_type: preprocessor
   vocab_size: 30000
   min_frequency: 2

+name: bpe_tokenizer
 config_type: preprocessor
+truncation_strategy: no_truncation
+padding_strategy: no_padding
 pad_token_id: 0
+pad_token: <pad>
+continuing_subword_prefix: ''
+end_of_word_suffix: ''
+fuse_unk: false
 train_config:
+  name: bpe_tokenizer
   config_type: preprocessor
   vocab_size: 30000
   min_frequency: 2