arxyzan committed on
Commit
2b2d98f
1 Parent(s): d957b41

Hezar: Upload model and config

Browse files
Files changed (1) hide show
  1. preprocessor/tokenizer_config.yaml +36 -0
preprocessor/tokenizer_config.yaml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: sentencepiece_unigram_tokenizer
2
+ config_type: preprocessor
3
+ pretrained_path: t5-base-fa
4
+ max_length: 512
5
+ truncation_strategy: longest_first
6
+ truncation_direction: right
7
+ stride: 0
8
+ padding_strategy: longest
9
+ padding_direction: right
10
+ pad_to_multiple_of: 0
11
+ pad_token_id: 0
12
+ pad_token: <pad>
13
+ pad_token_type_id: 0
14
+ unk_token: <unk>
15
+ special_tokens:
16
+ - <s>
17
+ - <pad>
18
+ - </s>
19
+ - <unk>
20
+ - <mask>
21
+ - <|endoftext|>
22
+ - <|startoftext|>
23
+ - <nl>
24
+ - <hs>
25
+ - <sep>
26
+ - <cls>
27
+ continuing_subword_prefix: ''
28
+ replacement: _
29
+ add_prefix_space: true
30
+ end_of_word_suffix: ''
31
+ fuse_unk: false
32
+ vocab_size: 32103
33
+ min_frequency: 2
34
+ limit_alphabet: 1000
35
+ initial_alphabet: []
36
+ show_progress: true