arxyzan committed on
Commit
ba6768b
1 Parent(s): e5cba47

Hezar: Upload tokenizer_config.yaml

Browse files
Files changed (1) hide show
  1. preprocessor/tokenizer_config.yaml +15 -9
preprocessor/tokenizer_config.yaml CHANGED
@@ -1,19 +1,25 @@
1
  name: bpe_tokenizer
2
  config_type: preprocessor
 
3
  truncation_strategy: longest_first
4
  truncation_direction: right
 
5
  padding_strategy: longest
6
  padding_direction: right
7
- pad_token_id: 0
8
- pad_token: <pad>
9
  pad_token_type_id: 0
 
 
 
 
 
 
 
10
  continuing_subword_prefix: ''
11
  end_of_word_suffix: ''
12
  fuse_unk: false
13
- train_config:
14
- name: bpe_tokenizer
15
- config_type: preprocessor
16
- vocab_size: 30000
17
- min_frequency: 2
18
- limit_alphabet: 1000
19
- show_progress: true
 
1
  name: bpe_tokenizer
2
  config_type: preprocessor
3
+ max_length: 512
4
  truncation_strategy: longest_first
5
  truncation_direction: right
6
+ stride: 0
7
  padding_strategy: longest
8
  padding_direction: right
9
+ pad_to_multiple_of: 0
 
10
  pad_token_type_id: 0
11
+ bos_token: <s>
12
+ eos_token: </s>
13
+ unk_token: <unk>
14
+ sep_token: <sep>
15
+ pad_token: <pad>
16
+ cls_token: <cls>
17
+ mask_token: <mask>
18
  continuing_subword_prefix: ''
19
  end_of_word_suffix: ''
20
  fuse_unk: false
21
+ vocab_size: 42000
22
+ min_frequency: 2
23
+ limit_alphabet: 1000
24
+ initial_alphabet: []
25
+ show_progress: true