---
# WordPiece tokenizer preprocessor configuration (pretrained: hezar-ai/bert-base-fa).
name: wordpiece_tokenizer
config_type: preprocessor
pretrained_path: hezar-ai/bert-base-fa
max_length: 512
truncation_strategy: longest_first
truncation_direction: right
stride: 0
padding_strategy: longest
padding_direction: right
pad_to_multiple_of: 0
pad_token_id: 0
pad_token: '[PAD]'
pad_token_type_id: 0
unk_token: '[UNK]'
special_tokens:
- '[UNK]'
- '[SEP]'
- '[CLS]'
- '[PAD]'
- '[MASK]'
wordpieces_prefix: '##'
train_config:
  name: wordpiece_tokenizer
  config_type: preprocessor
  vocab_size: 30000
  min_frequency: 2
  limit_alphabet: 1000
  initial_alphabet: []
  show_progress: true