arxyzan committed on
Commit
dd73fd7
1 Parent(s): ffda8df

Hezar: Upload tokenizer_config.yaml

Browse files
Files changed (1) hide show
  1. preprocessor/tokenizer_config.yaml +6 -15
preprocessor/tokenizer_config.yaml CHANGED
@@ -1,6 +1,5 @@
1
  name: sentencepiece_unigram_tokenizer
2
  config_type: preprocessor
3
- pretrained_path: t5-base-fa
4
  max_length: 512
5
  truncation_strategy: longest_first
6
  truncation_direction: right
@@ -8,22 +7,14 @@ stride: 0
8
  padding_strategy: longest
9
  padding_direction: right
10
  pad_to_multiple_of: 0
11
- pad_token_id: 0
12
- pad_token: <pad>
13
  pad_token_type_id: 0
 
 
14
  unk_token: <unk>
15
- special_tokens:
16
- - <s>
17
- - <pad>
18
- - </s>
19
- - <unk>
20
- - <mask>
21
- - <|endoftext|>
22
- - <|startoftext|>
23
- - <nl>
24
- - <hs>
25
- - <sep>
26
- - <cls>
27
  continuing_subword_prefix: ''
28
  replacement: _
29
  add_prefix_space: true
 
1
  name: sentencepiece_unigram_tokenizer
2
  config_type: preprocessor
 
3
  max_length: 512
4
  truncation_strategy: longest_first
5
  truncation_direction: right
 
7
  padding_strategy: longest
8
  padding_direction: right
9
  pad_to_multiple_of: 0
 
 
10
  pad_token_type_id: 0
11
+ bos_token: <s>
12
+ eos_token: </s>
13
  unk_token: <unk>
14
+ sep_token: <sep>
15
+ pad_token: <pad>
16
+ cls_token: <cls>
17
+ mask_token: <mask>
 
 
 
 
 
 
 
 
18
  continuing_subword_prefix: ''
19
  replacement: _
20
  add_prefix_space: true