arxyzan committed on
Commit
3b18bf5
1 Parent(s): 8e71a11

Hezar: Upload tokenizer_config.yaml

Browse files
Files changed (1) hide show
  1. preprocessor/tokenizer_config.yaml +11 -7
preprocessor/tokenizer_config.yaml CHANGED
@@ -1,4 +1,4 @@
1
- name: wordpiece_tokenizer
2
  config_type: preprocessor
3
  max_length: 512
4
  truncation_strategy: longest_first
@@ -8,12 +8,16 @@ padding_strategy: longest
8
  padding_direction: right
9
  pad_to_multiple_of: 0
10
  pad_token_type_id: 0
11
- unk_token: '[UNK]'
12
- sep_token: '[SEP]'
13
- pad_token: '[PAD]'
14
- cls_token: '[CLS]'
15
- mask_token: '[MASK]'
16
- wordpieces_prefix: '##'
 
 
 
 
17
  vocab_size: 42000
18
  min_frequency: 2
19
  limit_alphabet: 1000
 
1
+ name: bpe_tokenizer
2
  config_type: preprocessor
3
  max_length: 512
4
  truncation_strategy: longest_first
 
8
  padding_direction: right
9
  pad_to_multiple_of: 0
10
  pad_token_type_id: 0
11
+ bos_token: <s>
12
+ eos_token: </s>
13
+ unk_token: <unk>
14
+ sep_token: <sep>
15
+ pad_token: <pad>
16
+ cls_token: <cls>
17
+ mask_token: <mask>
18
+ continuing_subword_prefix: ''
19
+ end_of_word_suffix: ''
20
+ fuse_unk: false
21
  vocab_size: 42000
22
  min_frequency: 2
23
  limit_alphabet: 1000