whisper-small-ru-pruned-ft / tokenizer_config.json
waveletdeboshir's picture
Upload model, tokenizer, preprocessor
9104ccf verified
{
"add_bos_token": false,
"add_prefix_space": false,
"added_tokens_decoder": {
"4197": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"4198": {
"content": "<|startoftranscript|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"4199": {
"content": "<|en|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"4200": {
"content": "<|ru|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"4201": {
"content": "<|translate|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"4202": {
"content": "<|transcribe|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"4203": {
"content": "<|startoflm|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"4204": {
"content": "<|startofprev|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"4205": {
"content": "<|nocaptions|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"4206": {
"content": "<|notimestamps|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"additional_special_tokens": [
"<|endoftext|>",
"<|startoftranscript|>",
"<|en|>",
"<|ru|>",
"<|translate|>",
"<|transcribe|>",
"<|startoflm|>",
"<|startofprev|>",
"<|nocaptions|>",
"<|notimestamps|>"
],
"bos_token": "<|endoftext|>",
"clean_up_tokenization_spaces": true,
"eos_token": "<|endoftext|>",
"errors": "replace",
"model_max_length": 1024,
"pad_token": "<|endoftext|>",
"processor_class": "WhisperProcessor",
"return_attention_mask": false,
"tokenizer_class": "WhisperTokenizer",
"unk_token": "<|endoftext|>"
}