arxyzan committed on
Commit
dd13632
1 Parent(s): 81eab5b

Hezar: Upload tokenizer and config

Browse files
Files changed (1) hide show
  1. preprocessor/tokenizer_config.yaml +37 -0
preprocessor/tokenizer_config.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: bpe_tokenizer
2
+ config_type: preprocessor
3
+ max_length: 512
4
+ truncation_strategy: longest_first
5
+ truncation_direction: right
6
+ stride: 0
7
+ padding_strategy: longest
8
+ padding_direction: right
9
+ pad_to_multiple_of: 0
10
+ pad_token_type_id: 0
11
+ bos_token: <s>
12
+ eos_token: </s>
13
+ unk_token: <unk>
14
+ sep_token: <sep>
15
+ pad_token: </s>
16
+ cls_token: <cls>
17
+ mask_token: <mask>
18
+ special_tokens:
19
+ - <s>
20
+ - <pad>
21
+ - </s>
22
+ - <unk>
23
+ - <mask>
24
+ - <|endoftext|>
25
+ - <|startoftext|>
26
+ - <nl>
27
+ - <hs>
28
+ - <sep>
29
+ - <cls>
30
+ continuing_subword_prefix: ''
31
+ end_of_word_suffix: ''
32
+ fuse_unk: false
33
+ vocab_size: 42000
34
+ min_frequency: 2
35
+ limit_alphabet: 1000
36
+ initial_alphabet: []
37
+ show_progress: true