Hezar: Upload tokenizer_config.yaml
preprocessor/tokenizer_config.yaml
CHANGED
@@ -1,6 +1,5 @@
 name: sentencepiece_unigram_tokenizer
 config_type: preprocessor
-pretrained_path: t5-base-fa
 max_length: 512
 truncation_strategy: longest_first
 truncation_direction: right
@@ -8,22 +7,14 @@ stride: 0
 padding_strategy: longest
 padding_direction: right
 pad_to_multiple_of: 0
-pad_token_id: 0
-pad_token: <pad>
 pad_token_type_id: 0
+bos_token: <s>
+eos_token: </s>
 unk_token: <unk>
-
-
-
-
-- <unk>
-- <mask>
-- <|endoftext|>
-- <|startoftext|>
-- <nl>
-- <hs>
-- <sep>
-- <cls>
+sep_token: <sep>
+pad_token: <pad>
+cls_token: <cls>
+mask_token: <mask>
 continuing_subword_prefix: ''
 replacement: _
 add_prefix_space: true
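As a minimal sketch (not part of the commit), the snippet below reads the uploaded preprocessor/tokenizer_config.yaml with PyYAML and prints the named special-token fields this change introduces, which replace the old special_tokens list and the pad_token_id/pad_token entries. It assumes the repository files have been downloaded locally so the path exists; in practice the config would normally be consumed through Hezar's own preprocessor loading API rather than read by hand.

# Sketch only: inspect the uploaded tokenizer config and show the
# special-token keys added in this commit. Assumes the repo content
# is available locally under preprocessor/.
import yaml

with open("preprocessor/tokenizer_config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Each special token is now its own key instead of one special_tokens list.
for key in ("bos_token", "eos_token", "unk_token", "sep_token",
            "pad_token", "cls_token", "mask_token"):
    print(f"{key}: {config.get(key)}")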