---
# Configuration for the `bpe_tokenizer` preprocessor.

name: bpe_tokenizer
config_type: preprocessor
pretrained_path: hezarai/roberta-base-fa

# Truncation settings.
max_length: 512
truncation_strategy: longest_first
truncation_direction: right
stride: 0

# Padding settings.
padding_strategy: longest
padding_direction: right
# NOTE(review): 0 presumably means "no multiple-of constraint" — confirm
# against the consumer.
pad_to_multiple_of: 0
# NOTE(review): pad_token_id is 0, yet '<pad>' is the second entry in
# special_tokens below — confirm it matches the id in the trained vocabulary.
pad_token_id: 0
pad_token: '<pad>'
pad_token_type_id: 0

unk_token: '<unk>'

# Quoted so every token is unambiguously a literal string.
special_tokens:
  - '<s>'
  - '<pad>'
  - '</s>'
  - '<unk>'
  - '<mask>'
  - '<|endoftext|>'
  - '<|startoftext|>'
  - '<nl>'
  - '<hs>'
  - '<sep>'
  - '<cls>'

# BPE model options; both affixes are explicitly empty strings.
continuing_subword_prefix: ''
end_of_word_suffix: ''
fuse_unk: false

# Nested training configuration. It repeats `name` and `config_type`, so these
# keys must stay indented under `train_config` rather than at the top level,
# where they would be duplicate keys.
train_config:
  name: bpe_tokenizer
  config_type: preprocessor
  vocab_size: 30000
  min_frequency: 2
  limit_alphabet: 1000
  show_progress: true