arxyzan committed on
Commit
8e71a11
1 Parent(s): ba6768b

Hezar: Upload tokenizer_config.yaml

Browse files
Files changed (1) hide show
  1. preprocessor/tokenizer_config.yaml +7 -11
preprocessor/tokenizer_config.yaml CHANGED
@@ -1,4 +1,4 @@
1
- name: bpe_tokenizer
2
  config_type: preprocessor
3
  max_length: 512
4
  truncation_strategy: longest_first
@@ -8,16 +8,12 @@ padding_strategy: longest
8
  padding_direction: right
9
  pad_to_multiple_of: 0
10
  pad_token_type_id: 0
11
- bos_token: <s>
12
- eos_token: </s>
13
- unk_token: <unk>
14
- sep_token: <sep>
15
- pad_token: <pad>
16
- cls_token: <cls>
17
- mask_token: <mask>
18
- continuing_subword_prefix: ''
19
- end_of_word_suffix: ''
20
- fuse_unk: false
21
  vocab_size: 42000
22
  min_frequency: 2
23
  limit_alphabet: 1000
 
1
+ name: wordpiece_tokenizer
2
  config_type: preprocessor
3
  max_length: 512
4
  truncation_strategy: longest_first
 
8
  padding_direction: right
9
  pad_to_multiple_of: 0
10
  pad_token_type_id: 0
11
+ unk_token: '[UNK]'
12
+ sep_token: '[SEP]'
13
+ pad_token: '[PAD]'
14
+ cls_token: '[CLS]'
15
+ mask_token: '[MASK]'
16
+ wordpieces_prefix: '##'
 
 
 
 
17
  vocab_size: 42000
18
  min_frequency: 2
19
  limit_alphabet: 1000