arxyzan committed on
Commit
54b8211
1 Parent(s): e48c764

Hezar: Upload tokenizer and config

Browse files
Files changed (1) hide show
  1. preprocessor/tokenizer_config.yaml +11 -10
preprocessor/tokenizer_config.yaml CHANGED
@@ -1,18 +1,21 @@
1
  name: whisper_bpe_tokenizer
2
  config_type: preprocessor
3
- pretrained_path: whisper-small-fa
4
- max_length: 512
5
  truncation_strategy: longest_first
6
  truncation_direction: right
7
  stride: 0
8
  padding_strategy: longest
9
  padding_direction: right
10
  pad_to_multiple_of: 0
11
- pad_token_id: 0
12
- pad_token: <pad>
13
  pad_token_type_id: 0
 
 
14
  unk_token: <|endoftext|>
15
- special_tokens:
 
 
 
 
16
  - <|endoftext|>
17
  - <|endoftext|>
18
  - <|startoftranscript|>
@@ -129,11 +132,9 @@ min_frequency: 2
129
  limit_alphabet: 1000
130
  initial_alphabet: []
131
  show_progress: true
132
- unk_token_id: 50257
133
- bos_token: <|startoftranscript|>
134
- bos_token_id: 50257
135
- eos_token: <|endoftext|>
136
- eos_token_id: 50257
137
  add_prefix_space: false
138
  add_bos_token: false
139
  model_max_length: 1024
 
1
  name: whisper_bpe_tokenizer
2
  config_type: preprocessor
3
+ max_length: 448
 
4
  truncation_strategy: longest_first
5
  truncation_direction: right
6
  stride: 0
7
  padding_strategy: longest
8
  padding_direction: right
9
  pad_to_multiple_of: 0
 
 
10
  pad_token_type_id: 0
11
+ bos_token: <|endoftext|>
12
+ eos_token: <|endoftext|>
13
  unk_token: <|endoftext|>
14
+ sep_token: <sep>
15
+ pad_token: <|endoftext|>
16
+ cls_token: <cls>
17
+ mask_token: <mask>
18
+ additional_special_tokens:
19
  - <|endoftext|>
20
  - <|endoftext|>
21
  - <|startoftranscript|>
 
132
  limit_alphabet: 1000
133
  initial_alphabet: []
134
  show_progress: true
135
+ translate_token: <|translate|>
136
+ transcribe_token: <|transcribe|>
137
+ notimestamps_token: <|notimestamps|>
 
 
138
  add_prefix_space: false
139
  add_bos_token: false
140
  model_max_length: 1024