arxyzan committed on
Commit
594b1ec
1 Parent(s): 9fe7716

Upload preprocessor with huggingface_hub

Browse files
preprocessor/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
preprocessor/tokenizer_config.yaml CHANGED
@@ -1,12 +1,19 @@
1
- name: bpe_tokenizer
2
  config_type: preprocessor
3
- truncation_strategy: no_truncation
4
- padding_strategy: no_padding
5
- continuing_subword_prefix: ''
6
- end_of_word_suffix: ''
7
- fuse_unk: false
 
 
 
 
 
 
 
8
  train_config:
9
- name: bpe_tokenizer
10
  config_type: preprocessor
11
  vocab_size: 30000
12
  min_frequency: 2
 
1
+ name: wordpiece_tokenizer
2
  config_type: preprocessor
3
+ max_length: 512
4
+ truncation_strategy: longest_first
5
+ truncation_direction: right
6
+ stride: 0
7
+ padding_strategy: longest
8
+ padding_direction: right
9
+ pad_to_multiple_of: 0
10
+ pad_token_id: 0
11
+ pad_token: '[PAD]'
12
+ pad_token_type_id: 0
13
+ unk_token: '[UNK]'
14
+ wordpieces_prefix: '##'
15
  train_config:
16
+ name: wordpiece_tokenizer
17
  config_type: preprocessor
18
  vocab_size: 30000
19
  min_frequency: 2