arxyzan committed on
Commit
9fe7716
1 Parent(s): 56b4e3c

Upload preprocessor with huggingface_hub

Browse files
preprocessor/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor/tokenizer_config.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ name: bpe_tokenizer
+ config_type: preprocessor
+ truncation_strategy: no_truncation
+ padding_strategy: no_padding
+ continuing_subword_prefix: ''
+ end_of_word_suffix: ''
+ fuse_unk: false
+ train_config:
+   name: bpe_tokenizer
+   config_type: preprocessor
+   vocab_size: 30000
+   min_frequency: 2
+   limit_alphabet: 1000
+   show_progress: true