arxyzan committed on
Commit
889f931
1 Parent(s): 594b1ec

Upload preprocessor with huggingface_hub

Browse files
preprocessor/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
preprocessor/tokenizer_config.yaml CHANGED
@@ -1,19 +1,14 @@
1
- name: wordpiece_tokenizer
2
  config_type: preprocessor
3
- max_length: 512
4
- truncation_strategy: longest_first
5
- truncation_direction: right
6
- stride: 0
7
- padding_strategy: longest
8
- padding_direction: right
9
- pad_to_multiple_of: 0
10
  pad_token_id: 0
11
- pad_token: '[PAD]'
12
- pad_token_type_id: 0
13
- unk_token: '[UNK]'
14
- wordpieces_prefix: '##'
15
  train_config:
16
- name: wordpiece_tokenizer
17
  config_type: preprocessor
18
  vocab_size: 30000
19
  min_frequency: 2
 
1
+ name: bpe_tokenizer
2
  config_type: preprocessor
3
+ truncation_strategy: no_truncation
4
+ padding_strategy: no_padding
 
 
 
 
 
5
  pad_token_id: 0
6
+ pad_token: <pad>
7
+ continuing_subword_prefix: ''
8
+ end_of_word_suffix: ''
9
+ fuse_unk: false
10
  train_config:
11
+ name: bpe_tokenizer
12
  config_type: preprocessor
13
  vocab_size: 30000
14
  min_frequency: 2