Make speech padding configurable.
Browse files
Unfortunately, it seems that zero padding is not entirely
desirable either, as it increases the probability of a mistake
in between each speech section passed to Whisper.
Thus, we set the default to 1, but we leave it up to the user
to configure it to zero or some other value depending
on their preference:
0: Better timestamps, but more transcription mistakes.
1: Worse timestamps, but more accurate transcription
in between each speech section.
- app.py +7 -4
- docs/options.md +7 -1
- src/vad.py +5 -5
app.py
CHANGED
@@ -52,7 +52,7 @@ class UI:
|
|
52 |
self.vad_model = None
|
53 |
self.inputAudioMaxDuration = inputAudioMaxDuration
|
54 |
|
55 |
-
def transcribeFile(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize):
|
56 |
try:
|
57 |
source, sourceName = self.getSource(urlData, uploadFile, microphoneData)
|
58 |
|
@@ -76,7 +76,8 @@ class UI:
|
|
76 |
self.vad_model = VadSileroTranscription()
|
77 |
|
78 |
process_gaps = VadSileroTranscription(transcribe_non_speech = True,
|
79 |
-
max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize,
|
|
|
80 |
result = process_gaps.transcribe(source, whisperCallable)
|
81 |
elif (vad == 'silero-vad-skip-gaps'):
|
82 |
# Use Silero VAD
|
@@ -84,7 +85,8 @@ class UI:
|
|
84 |
self.vad_model = VadSileroTranscription()
|
85 |
|
86 |
skip_gaps = VadSileroTranscription(transcribe_non_speech = False,
|
87 |
-
max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize,
|
|
|
88 |
result = skip_gaps.transcribe(source, whisperCallable)
|
89 |
elif (vad == 'periodic-vad'):
|
90 |
# Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an infinite loop, but
|
@@ -197,7 +199,8 @@ def createUi(inputAudioMaxDuration, share=False, server_name: str = None):
|
|
197 |
gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
|
198 |
gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "periodic-vad"], label="VAD"),
|
199 |
gr.Number(label="VAD - Merge Window (s)", precision=0, value=5),
|
200 |
-
gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=150)
|
|
|
201 |
], outputs=[
|
202 |
gr.File(label="Download"),
|
203 |
gr.Text(label="Transcription"),
|
|
|
52 |
self.vad_model = None
|
53 |
self.inputAudioMaxDuration = inputAudioMaxDuration
|
54 |
|
55 |
+
def transcribeFile(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding):
|
56 |
try:
|
57 |
source, sourceName = self.getSource(urlData, uploadFile, microphoneData)
|
58 |
|
|
|
76 |
self.vad_model = VadSileroTranscription()
|
77 |
|
78 |
process_gaps = VadSileroTranscription(transcribe_non_speech = True,
|
79 |
+
max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize,
|
80 |
+
segment_padding_left=vadPadding, segment_padding_right=vadPadding, copy=self.vad_model)
|
81 |
result = process_gaps.transcribe(source, whisperCallable)
|
82 |
elif (vad == 'silero-vad-skip-gaps'):
|
83 |
# Use Silero VAD
|
|
|
85 |
self.vad_model = VadSileroTranscription()
|
86 |
|
87 |
skip_gaps = VadSileroTranscription(transcribe_non_speech = False,
|
88 |
+
max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize,
|
89 |
+
segment_padding_left=vadPadding, segment_padding_right=vadPadding, copy=self.vad_model)
|
90 |
result = skip_gaps.transcribe(source, whisperCallable)
|
91 |
elif (vad == 'periodic-vad'):
|
92 |
# Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an infinite loop, but
|
|
|
199 |
gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
|
200 |
gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "periodic-vad"], label="VAD"),
|
201 |
gr.Number(label="VAD - Merge Window (s)", precision=0, value=5),
|
202 |
+
gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=150),
|
203 |
+
gr.Number(label="VAD - Padding (s)", precision=None, value=1)
|
204 |
], outputs=[
|
205 |
gr.File(label="Download"),
|
206 |
gr.Text(label="Transcription"),
|
docs/options.md
CHANGED
@@ -49,4 +49,10 @@ Select the task - either "transcribe" to transcribe the audio to text, or "trans
|
|
49 |
If set, any adjacent speech sections that are at most this number of seconds apart will be automatically merged.
|
50 |
|
51 |
## VAD - Max Merge Size (s)
|
52 |
-
Disables merging of adjacent speech sections if they are this number of seconds long.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
If set, any adjacent speech sections that are at most this number of seconds apart will be automatically merged.
|
50 |
|
51 |
## VAD - Max Merge Size (s)
|
52 |
+
Disables merging of adjacent speech sections if the merged section would be longer than this number of seconds.
|
53 |
+
|
54 |
+
## VAD - Padding (s)
|
55 |
+
The number of seconds (floating point) to add to the beginning and end of each speech section. Setting this to a number
|
56 |
+
larger than zero ensures that Whisper is more likely to correctly transcribe a sentence at the beginning of
|
57 |
+
a speech section. However, this also increases the probability of Whisper assigning the wrong timestamp
|
58 |
+
to each transcribed line. The default value is 1 second.
|
src/vad.py
CHANGED
@@ -25,9 +25,9 @@ SPEECH_TRESHOLD = 0.3
|
|
25 |
MAX_SILENT_PERIOD = 10 # seconds
|
26 |
MAX_MERGE_SIZE = 150 # Do not create segments larger than 2.5 minutes
|
27 |
|
28 |
-
#
|
29 |
-
SEGMENT_PADDING_LEFT =
|
30 |
-
SEGMENT_PADDING_RIGHT =
|
31 |
|
32 |
# Whether to attempt to transcribe non-speech
|
33 |
TRANSCRIBE_NON_SPEECH = False
|
@@ -38,7 +38,7 @@ MIN_SEGMENT_DURATION = 1
|
|
38 |
VAD_MAX_PROCESSING_CHUNK = 60 * 60 # 60 minutes of audio
|
39 |
|
40 |
class AbstractTranscription(ABC):
|
41 |
-
def __init__(self, segment_padding_left:
|
42 |
self.sampling_rate = 16000
|
43 |
self.segment_padding_left = segment_padding_left
|
44 |
self.segment_padding_right = segment_padding_right
|
@@ -334,7 +334,7 @@ class VadSileroTranscription(AbstractTranscription):
|
|
334 |
|
335 |
# A very simple VAD that just marks every N seconds as speech
|
336 |
class VadPeriodicTranscription(AbstractTranscription):
|
337 |
-
def __init__(self, periodic_duration:
|
338 |
super().__init__()
|
339 |
self.periodic_duration = periodic_duration
|
340 |
|
|
|
25 |
MAX_SILENT_PERIOD = 10 # seconds
|
26 |
MAX_MERGE_SIZE = 150 # Do not create segments larger than 2.5 minutes
|
27 |
|
28 |
+
# Default segment padding
|
29 |
+
SEGMENT_PADDING_LEFT = 1 # Start detected text segment early
|
30 |
+
SEGMENT_PADDING_RIGHT = 1 # End detected segments late
|
31 |
|
32 |
# Whether to attempt to transcribe non-speech
|
33 |
TRANSCRIBE_NON_SPEECH = False
|
|
|
38 |
VAD_MAX_PROCESSING_CHUNK = 60 * 60 # 60 minutes of audio
|
39 |
|
40 |
class AbstractTranscription(ABC):
|
41 |
+
def __init__(self, segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None, max_merge_size: float = None, transcribe_non_speech: bool = False):
|
42 |
self.sampling_rate = 16000
|
43 |
self.segment_padding_left = segment_padding_left
|
44 |
self.segment_padding_right = segment_padding_right
|
|
|
334 |
|
335 |
# A very simple VAD that just marks every N seconds as speech
|
336 |
class VadPeriodicTranscription(AbstractTranscription):
|
337 |
+
def __init__(self, periodic_duration: float):
|
338 |
super().__init__()
|
339 |
self.periodic_duration = periodic_duration
|
340 |
|