---
# WordPiece tokenizer preprocessor config (Hezar-style; weights pulled from
# the `pretrained_path` hub repo below).
name: wordpiece_tokenizer
config_type: preprocessor
pretrained_path: hezar-ai/bert-base-fa

# Sequence-length handling
max_length: 512
truncation_strategy: longest_first
truncation_direction: right
stride: 0  # overlap between overflow chunks when truncating; 0 = no overlap

# Padding behavior
padding_strategy: longest
padding_direction: right
pad_to_multiple_of: 0  # 0 disables rounding padded length up to a multiple
pad_token_id: 0
pad_token: '[PAD]'
pad_token_type_id: 0

# Special tokens — quoted because '[' would otherwise start a flow sequence
unk_token: '[UNK]'
special_tokens:
  - '[UNK]'
  - '[SEP]'
  - '[CLS]'
  - '[PAD]'
  - '[MASK]'
wordpieces_prefix: '##'  # prefix marking non-initial sub-word pieces

# Settings used only when training a tokenizer from scratch
train_config:
  name: wordpiece_tokenizer
  config_type: preprocessor
  vocab_size: 30000
  min_frequency: 2  # drop tokens seen fewer than this many times
  limit_alphabet: 1000
  initial_alphabet: []
  show_progress: true