Add more configuration options to config.json5
- app-local.py +3 -1
- app-network.py +3 -1
- app-shared.py +3 -1
- app.py +35 -39
- cli.py +27 -24
- config.json5 +54 -3
- src/config.py +64 -4
app-local.py
CHANGED
@@ -1,3 +1,5 @@
 # Run the app with no audio file restrictions
 from app import create_ui
-create_ui(input_audio_max_duration=-1)
+from src.config import ApplicationConfig
+
+create_ui(ApplicationConfig.create_default(input_audio_max_duration=-1))
app-network.py
CHANGED
@@ -1,3 +1,5 @@
 # Run the app with no audio file restrictions, and make it available on the network
 from app import create_ui
-create_ui(input_audio_max_duration=-1, server_name="0.0.0.0")
+from src.config import ApplicationConfig
+
+create_ui(ApplicationConfig.create_default(input_audio_max_duration=-1, server_name="0.0.0.0"))
app-shared.py
CHANGED
@@ -1,3 +1,5 @@
 # Run the app with no audio file restrictions
 from app import create_ui
-create_ui(input_audio_max_duration=-1, share=True)
+from src.config import ApplicationConfig
+
+create_ui(ApplicationConfig.create_default(input_audio_max_duration=-1, share=True))
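The three launcher scripts above now differ only in the keyword overrides they pass to ApplicationConfig.create_default, which loads config.json5 and applies the overrides on top (see the src/config.py diff below). As a sketch of the pattern, a custom launcher could look like this — the file name and override values here are hypothetical, not part of the commit:

    # app-custom.py (hypothetical): same pattern as the launchers above
    from app import create_ui
    from src.config import ApplicationConfig

    # Any field in config.json5 can be overridden as a keyword argument;
    # unspecified fields keep the values read from the config file.
    create_ui(ApplicationConfig.create_default(input_audio_max_duration=600, server_port=7861))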
app.py
CHANGED
@@ -27,11 +27,7 @@ from src.utils import slugify, write_srt, write_vtt
 from src.vad import AbstractTranscription, NonSpeechStrategy, PeriodicTranscriptionConfig, TranscriptionConfig, VadPeriodicTranscription, VadSileroTranscription
 from src.whisperContainer import WhisperContainer
 
-#
-DEFAULT_INPUT_AUDIO_MAX_DURATION = 600 # seconds
-
-# Whether or not to automatically delete all uploaded files, to save disk space
-DELETE_UPLOADED_FILES = True
+# Configure more application defaults in config.json5
 
 # Gradio seems to truncate files without keeping the extension, so we need to truncate the file prefix ourself
 MAX_FILE_PREFIX_LENGTH = 17
@@ -62,8 +58,8 @@ LANGUAGES = [
 WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v1", "large-v2"]
 
 class WhisperTranscriber:
-    def __init__(self, input_audio_max_duration: float = DEFAULT_INPUT_AUDIO_MAX_DURATION, vad_process_timeout: float = None,
-                 vad_cpu_cores: int = 1, delete_uploaded_files: bool = DELETE_UPLOADED_FILES, output_dir: str = None,
+    def __init__(self, input_audio_max_duration: float = None, vad_process_timeout: float = None,
+                 vad_cpu_cores: int = 1, delete_uploaded_files: bool = False, output_dir: str = None,
                  app_config: ApplicationConfig = None):
         self.model_cache = ModelCache()
         self.parallel_device_list = None
@@ -361,15 +357,13 @@ class WhisperTranscriber:
         self.cpu_parallel_context.close()
 
 
-def create_ui(input_audio_max_duration, share=False, server_name: str = None, server_port: int = 7860,
-              default_model_name: str = "medium", default_vad: str = None,
-              vad_parallel_devices: str = None, vad_process_timeout: float = None, vad_cpu_cores: int = 1, auto_parallel: bool = False,
-              output_dir: str = None, app_config: ApplicationConfig = None):
-    ui = WhisperTranscriber(input_audio_max_duration, vad_process_timeout, vad_cpu_cores, DELETE_UPLOADED_FILES, output_dir, app_config)
+def create_ui(app_config: ApplicationConfig):
+    ui = WhisperTranscriber(app_config.input_audio_max_duration, app_config.vad_process_timeout, app_config.vad_cpu_cores,
+                            app_config.delete_uploaded_files, app_config.output_dir, app_config)
 
     # Specify a list of devices to use for parallel processing
-    ui.set_parallel_devices(vad_parallel_devices)
-    ui.set_auto_parallel(auto_parallel)
+    ui.set_parallel_devices(app_config.vad_parallel_devices)
+    ui.set_auto_parallel(app_config.auto_parallel)
 
     ui_description = "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
     ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
@@ -377,25 +371,25 @@ def create_ui(input_audio_max_duration, share=False, server_name: str = None, server_port: int = 7860,
 
     ui_description += "\n\n\n\nFor longer audio files (>10 minutes) not in English, it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option."
 
-    if input_audio_max_duration > 0:
-        ui_description += "\n\n" + "Max audio file length: " + str(input_audio_max_duration) + " s"
+    if app_config.input_audio_max_duration > 0:
+        ui_description += "\n\n" + "Max audio file length: " + str(app_config.input_audio_max_duration) + " s"
 
     ui_article = "Read the [documentation here](https://gitlab.com/aadnk/whisper-webui/-/blob/main/docs/options.md)"
 
     whisper_models = app_config.get_model_names()
 
     simple_inputs = lambda : [
-        gr.Dropdown(choices=whisper_models, value=default_model_name, label="Model"),
-        gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
+        gr.Dropdown(choices=whisper_models, value=app_config.default_model_name, label="Model"),
+        gr.Dropdown(choices=sorted(LANGUAGES), label="Language", value=app_config.language),
         gr.Text(label="URL (YouTube, etc.)"),
         gr.File(label="Upload Files", file_count="multiple"),
         gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
-        gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
-        gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=default_vad, label="VAD"),
-        gr.Number(label="VAD - Merge Window (s)", precision=0, value=5),
-        gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=30),
-        gr.Number(label="VAD - Padding (s)", precision=None, value=1),
-        gr.Number(label="VAD - Prompt Window (s)", precision=None, value=3),
+        gr.Dropdown(choices=["transcribe", "translate"], label="Task", value=app_config.task),
+        gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=app_config.default_vad, label="VAD"),
+        gr.Number(label="VAD - Merge Window (s)", precision=0, value=app_config.vad_merge_window),
+        gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=app_config.vad_max_merge_size),
+        gr.Number(label="VAD - Padding (s)", precision=None, value=app_config.vad_padding),
+        gr.Number(label="VAD - Prompt Window (s)", precision=None, value=app_config.vad_prompt_window),
     ]
 
     simple_transcribe = gr.Interface(fn=ui.transcribe_webui_simple, description=ui_description, article=ui_article, inputs=simple_inputs(), outputs=[
@@ -409,18 +403,18 @@ def create_ui(input_audio_max_duration, share=False, server_name: str = None, server_port: int = 7860,
     full_transcribe = gr.Interface(fn=ui.transcribe_webui_full, description=full_description, article=ui_article, inputs=[
         *simple_inputs(),
         gr.TextArea(label="Initial Prompt"),
-        gr.Number(label="Temperature", value=0),
-        gr.Number(label="Best Of - Non-zero temperature", value=5, precision=0),
-        gr.Number(label="Beam Size - Zero temperature", value=5, precision=0),
-        gr.Number(label="Patience - Zero temperature", value=None),
-        gr.Number(label="Length Penalty - Any temperature", value=None),
-        gr.Text(label="Suppress Tokens - Comma-separated list of token IDs", value="-1"),
-        gr.Checkbox(label="Condition on previous text", value=True),
-        gr.Checkbox(label="FP16", value=True),
-        gr.Number(label="Temperature increment on fallback", value=0.2),
-        gr.Number(label="Compression ratio threshold", value=2.4),
-        gr.Number(label="Logprob threshold", value=-1.0),
-        gr.Number(label="No speech threshold", value=0.6)
+        gr.Number(label="Temperature", value=app_config.temperature),
+        gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0),
+        gr.Number(label="Beam Size - Zero temperature", value=app_config.beam_size, precision=0),
+        gr.Number(label="Patience - Zero temperature", value=app_config.patience),
+        gr.Number(label="Length Penalty - Any temperature", value=app_config.length_penalty),
+        gr.Text(label="Suppress Tokens - Comma-separated list of token IDs", value=app_config.suppress_tokens),
+        gr.Checkbox(label="Condition on previous text", value=app_config.condition_on_previous_text),
+        gr.Checkbox(label="FP16", value=app_config.fp16),
+        gr.Number(label="Temperature increment on fallback", value=app_config.temperature_increment_on_fallback),
+        gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold),
+        gr.Number(label="Logprob threshold", value=app_config.logprob_threshold),
+        gr.Number(label="No speech threshold", value=app_config.no_speech_threshold)
     ], outputs=[
         gr.File(label="Download"),
         gr.Text(label="Transcription"),
@@ -429,13 +423,13 @@ def create_ui(input_audio_max_duration, share=False, server_name: str = None, server_port: int = 7860,
 
     demo = gr.TabbedInterface([simple_transcribe, full_transcribe], tab_names=["Simple", "Full"])
 
-    demo.launch(share=share, server_name=server_name, server_port=server_port)
+    demo.launch(share=app_config.share, server_name=app_config.server_name, server_port=app_config.server_port)
 
     # Clean up
     ui.close()
 
 if __name__ == '__main__':
-    app_config = ApplicationConfig.parse_file(os.environ.get("WHISPER_WEBUI_CONFIG", "config.json5"))
+    app_config = ApplicationConfig.create_default()
     whisper_models = app_config.get_model_names()
 
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -463,4 +457,6 @@ if __name__ == '__main__':
         help="directory to save the outputs") # None
 
     args = parser.parse_args().__dict__
-    create_ui(**args)
+
+    updated_config = app_config.update(**args)
+    create_ui(app_config=updated_config)
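The net effect in app.py is a single precedence chain: config.json5 supplies the baseline, the argparse defaults are taken from that loaded config, and explicitly passed command-line flags win via app_config.update(**args), which returns an updated copy rather than mutating the original. A minimal sketch of that behaviour (the port values are hypothetical):

    from src.config import ApplicationConfig

    app_config = ApplicationConfig.create_default()   # reads config.json5
    updated = app_config.update(server_port=7861)     # e.g. a parsed CLI flag

    print(app_config.server_port)  # 7860, as set in config.json5
    print(updated.server_port)     # 7861; update() copies, the original is unchanged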
cli.py
CHANGED
@@ -14,37 +14,40 @@ from src.utils import optional_float, optional_int, str2bool
 from src.whisperContainer import WhisperContainer
 
 def cli():
-    app_config = ApplicationConfig.parse_file(os.environ.get("WHISPER_WEBUI_CONFIG", "config.json5"))
+    app_config = ApplicationConfig.create_default()
     whisper_models = app_config.get_model_names()
 
+    # For the CLI, we fallback to saving the output to the current directory
+    output_dir = app_config.output_dir if app_config.output_dir is not None else "."
+
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument("audio", nargs="+", type=str, \
         help="audio file(s) to transcribe")
     parser.add_argument("--model", default=app_config.default_model_name, choices=whisper_models, \
         help="name of the Whisper model to use") # medium
-    parser.add_argument("--model_dir", type=str, default=None, \
+    parser.add_argument("--model_dir", type=str, default=app_config.model_dir, \
         help="the path to save model files; uses ~/.cache/whisper by default")
-    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", \
+    parser.add_argument("--device", default=app_config.device, \
         help="device to use for PyTorch inference")
-    parser.add_argument("--output_dir", "-o", type=str, default=".", \
+    parser.add_argument("--output_dir", "-o", type=str, default=output_dir, \
         help="directory to save the outputs")
-    parser.add_argument("--verbose", type=str2bool, default=True, \
+    parser.add_argument("--verbose", type=str2bool, default=app_config.verbose, \
         help="whether to print out the progress and debug messages")
 
-    parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], \
+    parser.add_argument("--task", type=str, default=app_config.task, choices=["transcribe", "translate"], \
         help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
-    parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES), \
+    parser.add_argument("--language", type=str, default=app_config.language, choices=sorted(LANGUAGES), \
         help="language spoken in the audio, specify None to perform language detection")
 
     parser.add_argument("--vad", type=str, default=app_config.default_vad, choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], \
         help="The voice activity detection algorithm to use") # silero-vad
-    parser.add_argument("--vad_merge_window", type=optional_float, default=5, \
+    parser.add_argument("--vad_merge_window", type=optional_float, default=app_config.vad_merge_window, \
         help="The window size (in seconds) to merge voice segments")
-    parser.add_argument("--vad_max_merge_size", type=optional_float, default=30, \
+    parser.add_argument("--vad_max_merge_size", type=optional_float, default=app_config.vad_max_merge_size,\
         help="The maximum size (in seconds) of a voice segment")
-    parser.add_argument("--vad_padding", type=optional_float, default=1, \
+    parser.add_argument("--vad_padding", type=optional_float, default=app_config.vad_padding, \
         help="The padding (in seconds) to add to each voice segment")
-    parser.add_argument("--vad_prompt_window", type=optional_float, default=3, \
+    parser.add_argument("--vad_prompt_window", type=optional_float, default=app_config.vad_prompt_window, \
         help="The window size of the prompt to pass to Whisper")
     parser.add_argument("--vad_cpu_cores", type=int, default=app_config.vad_cpu_cores, \
         help="The number of CPU cores to use for VAD pre-processing.") # 1
@@ -53,33 +56,33 @@ def cli():
     parser.add_argument("--auto_parallel", type=bool, default=app_config.auto_parallel, \
         help="True to use all available GPUs and CPU cores for processing. Use vad_cpu_cores/vad_parallel_devices to specify the number of CPU cores/GPUs to use.") # False
 
-    parser.add_argument("--temperature", type=float, default=0, \
+    parser.add_argument("--temperature", type=float, default=app_config.temperature, \
         help="temperature to use for sampling")
-    parser.add_argument("--best_of", type=optional_int, default=5, \
+    parser.add_argument("--best_of", type=optional_int, default=app_config.best_of, \
         help="number of candidates when sampling with non-zero temperature")
-    parser.add_argument("--beam_size", type=optional_int, default=5, \
+    parser.add_argument("--beam_size", type=optional_int, default=app_config.beam_size, \
        help="number of beams in beam search, only applicable when temperature is zero")
-    parser.add_argument("--patience", type=float, default=None, \
+    parser.add_argument("--patience", type=float, default=app_config.patience, \
        help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
-    parser.add_argument("--length_penalty", type=float, default=None, \
+    parser.add_argument("--length_penalty", type=float, default=app_config.length_penalty, \
        help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple lengt normalization by default")
 
-    parser.add_argument("--suppress_tokens", type=str, default="-1", \
+    parser.add_argument("--suppress_tokens", type=str, default=app_config.suppress_tokens, \
        help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
-    parser.add_argument("--initial_prompt", type=str, default=None, \
+    parser.add_argument("--initial_prompt", type=str, default=app_config.initial_prompt, \
        help="optional text to provide as a prompt for the first window.")
-    parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, \
+    parser.add_argument("--condition_on_previous_text", type=str2bool, default=app_config.condition_on_previous_text, \
        help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
-    parser.add_argument("--fp16", type=str2bool, default=True, \
+    parser.add_argument("--fp16", type=str2bool, default=app_config.fp16, \
        help="whether to perform inference in fp16; True by default")
 
-    parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, \
+    parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=app_config.temperature_increment_on_fallback, \
        help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
-    parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, \
+    parser.add_argument("--compression_ratio_threshold", type=optional_float, default=app_config.compression_ratio_threshold, \
        help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
-    parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, \
+    parser.add_argument("--logprob_threshold", type=optional_float, default=app_config.logprob_threshold, \
        help="if the average log probability is lower than this value, treat the decoding as failed")
-    parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, \
+    parser.add_argument("--no_speech_threshold", type=optional_float, default=app_config.no_speech_threshold, \
        help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
 
     args = parser.parse_args().__dict__
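Since every argparse default in cli.py now comes from the loaded ApplicationConfig, running the CLI with no flags behaves exactly as config.json5 specifies, and flags are only needed to deviate from it. Assuming cli.py is runnable as a script (its entry point is not shown in this diff), with hypothetical file names and values:

    # Override only the VAD merge settings; everything else follows config.json5
    python cli.py --vad silero-vad --vad_merge_window 4 --vad_max_merge_size 60 audio.mp3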
config.json5
CHANGED
@@ -45,7 +45,9 @@
     ],
     // Configuration options that will be used if they are not specified in the command line arguments.
 
-    //
+    // * WEBUI options *
+
+    // Maximum audio file length in seconds, or -1 for no limit. Ignored by CLI.
     "input_audio_max_duration": 600,
     // True to share the app on HuggingFace.
     "share": false,
@@ -53,6 +55,11 @@
     "server_name": null,
     // The port to bind to.
     "server_port": 7860,
+    // Whether or not to automatically delete all uploaded files, to save disk space
+    "delete_uploaded_files": true,
+
+    // * General options *
+
     // The default model name.
     "default_model_name": "medium",
     // The default VAD.
@@ -65,6 +72,50 @@
     "vad_process_timeout": 1800,
     // True to use all available GPUs and CPU cores for processing. Use vad_cpu_cores/vad_parallel_devices to specify the number of CPU cores/GPUs to use.
     "auto_parallel": false,
-    // Directory to save the outputs
-    "output_dir": null
+    // Directory to save the outputs (CLI will use the current directory if not specified)
+    "output_dir": null,
+    // The path to save model files; uses ~/.cache/whisper by default
+    "model_dir": null,
+    // Device to use for PyTorch inference, or Null to use the default device
+    "device": null,
+    // Whether to print out the progress and debug messages
+    "verbose": true,
+    // Whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')
+    "task": "transcribe",
+    // Language spoken in the audio, specify None to perform language detection
+    "language": null,
+    // The window size (in seconds) to merge voice segments
+    "vad_merge_window": 5,
+    // The maximum size (in seconds) of a voice segment
+    "vad_max_merge_size": 30,
+    // The padding (in seconds) to add to each voice segment
+    "vad_padding": 1,
+    // The window size of the prompt to pass to Whisper
+    "vad_prompt_window": 3,
+    // Temperature to use for sampling
+    "temperature": 0,
+    // Number of candidates when sampling with non-zero temperature
+    "best_of": 5,
+    // Number of beams in beam search, only applicable when temperature is zero
+    "beam_size": 5,
+    // Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search
+    "patience": null,
+    // Optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default
+    "length_penalty": null,
+    // Comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations
+    "suppress_tokens": "-1",
+    // Optional text to provide as a prompt for the first window
+    "initial_prompt": null,
+    // If True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop
+    "condition_on_previous_text": true,
+    // Whether to perform inference in fp16; True by default
+    "fp16": true,
+    // Temperature to increase when falling back when the decoding fails to meet either of the thresholds below
+    "temperature_increment_on_fallback": 0.2,
+    // If the gzip compression ratio is higher than this value, treat the decoding as failed
+    "compression_ratio_threshold": 2.4,
+    // If the average log probability is lower than this value, treat the decoding as failed
+    "logprob_threshold": -1.0,
+    // If the probability of the <no-speech> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence
+    "no_speech_threshold": 0.6
 }
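Note that create_default in src/config.py resolves the config path through the WHISPER_WEBUI_CONFIG environment variable before falling back to config.json5, so an alternate configuration file can be supplied without editing the one in the repository (the file name below is hypothetical):

    WHISPER_WEBUI_CONFIG=my-config.json5 python app.py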
src/config.py
CHANGED
@@ -3,6 +3,8 @@ import urllib
 import os
 from typing import List
 from urllib.parse import urlparse
+import json5
+import torch
 
 from tqdm import tqdm
 
@@ -101,14 +103,33 @@ class ModelConfig:
 
 class ApplicationConfig:
     def __init__(self, models: List[ModelConfig] = [], input_audio_max_duration: int = 600,
-                 share: bool = False, server_name: str = None, server_port: int = 7860,
-                 default_model_name: str = "medium", default_vad: str = "silero-vad",
-                 vad_parallel_devices: str = "", vad_cpu_cores: int = 1, vad_process_timeout: int = 1800, auto_parallel: bool = False, output_dir: str = None):
+                 share: bool = False, server_name: str = None, server_port: int = 7860, delete_uploaded_files: bool = True,
+                 default_model_name: str = "medium", default_vad: str = "silero-vad",
+                 vad_parallel_devices: str = "", vad_cpu_cores: int = 1, vad_process_timeout: int = 1800,
+                 auto_parallel: bool = False, output_dir: str = None,
+                 model_dir: str = None, device: str = None,
+                 verbose: bool = True, task: str = "transcribe", language: str = None,
+                 vad_merge_window: float = 5, vad_max_merge_size: float = 30,
+                 vad_padding: float = 1, vad_prompt_window: float = 3,
+                 temperature: float = 0, best_of: int = 5, beam_size: int = 5,
+                 patience: float = None, length_penalty: float = None,
+                 suppress_tokens: str = "-1", initial_prompt: str = None,
+                 condition_on_previous_text: bool = True, fp16: bool = True,
+                 temperature_increment_on_fallback: float = 0.2, compression_ratio_threshold: float = 2.4,
+                 logprob_threshold: float = -1.0, no_speech_threshold: float = 0.6):
+
+        if device is None:
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+
         self.models = models
+
+        # WebUI settings
         self.input_audio_max_duration = input_audio_max_duration
         self.share = share
         self.server_name = server_name
         self.server_port = server_port
+        self.delete_uploaded_files = delete_uploaded_files
+
         self.default_model_name = default_model_name
         self.default_vad = default_vad
         self.vad_parallel_devices = vad_parallel_devices
@@ -117,9 +138,48 @@ class ApplicationConfig:
         self.auto_parallel = auto_parallel
         self.output_dir = output_dir
 
+        self.model_dir = model_dir
+        self.device = device
+        self.verbose = verbose
+        self.task = task
+        self.language = language
+        self.vad_merge_window = vad_merge_window
+        self.vad_max_merge_size = vad_max_merge_size
+        self.vad_padding = vad_padding
+        self.vad_prompt_window = vad_prompt_window
+        self.temperature = temperature
+        self.best_of = best_of
+        self.beam_size = beam_size
+        self.patience = patience
+        self.length_penalty = length_penalty
+        self.suppress_tokens = suppress_tokens
+        self.initial_prompt = initial_prompt
+        self.condition_on_previous_text = condition_on_previous_text
+        self.fp16 = fp16
+        self.temperature_increment_on_fallback = temperature_increment_on_fallback
+        self.compression_ratio_threshold = compression_ratio_threshold
+        self.logprob_threshold = logprob_threshold
+        self.no_speech_threshold = no_speech_threshold
+
     def get_model_names(self):
         return [ x.name for x in self.models ]
 
+    def update(self, **new_values):
+        result = ApplicationConfig(**self.__dict__)
+
+        for key, value in new_values.items():
+            setattr(result, key, value)
+        return result
+
+    @staticmethod
+    def create_default(**kwargs):
+        app_config = ApplicationConfig.parse_file(os.environ.get("WHISPER_WEBUI_CONFIG", "config.json5"))
+
+        # Update with kwargs
+        if len(kwargs) > 0:
+            app_config = app_config.update(**kwargs)
+        return app_config
+
     @staticmethod
     def parse_file(config_path: str):
         import json5
@@ -131,4 +191,4 @@ class ApplicationConfig:
 
         models = [ ModelConfig(**x) for x in data_models ]
 
-        return ApplicationConfig(models, **data)
+        return ApplicationConfig(models, **data)
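One subtlety in update(): it clones the config via ApplicationConfig(**self.__dict__), which works because every attribute assigned in __init__ carries exactly the same name as its constructor parameter (models included), and because device has already been resolved to "cuda" or "cpu" before any copy is made. A small sketch of the resulting copy semantics, using the defaults from this commit:

    config = ApplicationConfig(models=[])     # device resolved in __init__
    updated = config.update(temperature=0.2)

    assert updated.temperature == 0.2
    assert config.temperature == 0            # original is left untouched
    assert updated.device == config.device    # resolved device carries over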