hezarai
/

roberta-base-fa

arxyzan committed on Oct 1, 2023

Commit

8e71a11

•

1 Parent(s): ba6768b

Hezar: Upload tokenizer_config.yaml

Files changed (1) hide show

preprocessor/tokenizer_config.yaml CHANGED Viewed

@@ -1,4 +1,4 @@
-name: bpe_tokenizer
 config_type: preprocessor
 max_length: 512
 truncation_strategy: longest_first
@@ -8,16 +8,12 @@ padding_strategy: longest
 padding_direction: right
 pad_to_multiple_of: 0
 pad_token_type_id: 0
-bos_token: <s>
-eos_token: </s>
-unk_token: <unk>
-sep_token: <sep>
-pad_token: <pad>
-cls_token: <cls>
-mask_token: <mask>
-continuing_subword_prefix: ''
-end_of_word_suffix: ''
-fuse_unk: false
 vocab_size: 42000
 min_frequency: 2
 limit_alphabet: 1000

+name: wordpiece_tokenizer
 config_type: preprocessor
 max_length: 512
 truncation_strategy: longest_first
 padding_direction: right
 pad_to_multiple_of: 0
 pad_token_type_id: 0
+unk_token: '[UNK]'
+sep_token: '[SEP]'
+pad_token: '[PAD]'
+cls_token: '[CLS]'
+mask_token: '[MASK]'
+wordpieces_prefix: '##'
 vocab_size: 42000
 min_frequency: 2
 limit_alphabet: 1000