Add max merge size in VAD

Also make this and the VAD merge window configurable.

- app.py +16 -9
- docs/options.md +45 -0
- vad.py +16 -7
app.py
CHANGED

```diff
@@ -53,7 +53,7 @@ class UI:
         self.vad_model = None
         self.inputAudioMaxDuration = inputAudioMaxDuration

-    def transcribeFile(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad):
+    def transcribeFile(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize):
         try:
             source, sourceName = self.getSource(urlData, uploadFile, microphoneData)

@@ -74,19 +74,23 @@ class UI:
         if (vad == 'silero-vad'):
             # Use Silero VAD and include gaps
             if (self.vad_model is None):
-                self.vad_model = VadSileroTranscription(…)
-            result = self.vad_model.transcribe(source, whisperCallable)
+                self.vad_model = VadSileroTranscription()
+
+            process_gaps = VadSileroTranscription(transcribe_non_speech = True,
+                max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize, copy=self.vad_model)
+            result = process_gaps.transcribe(source, whisperCallable)
         elif (vad == 'silero-vad-skip-gaps'):
             # Use Silero VAD
             if (self.vad_model is None):
-                self.vad_model = VadSileroTranscription(…)
+                self.vad_model = VadSileroTranscription()

-            skip_gaps = VadSileroTranscription(transcribe_non_speech = False, …)
+            skip_gaps = VadSileroTranscription(transcribe_non_speech = False,
+                max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize, copy=self.vad_model)
             result = skip_gaps.transcribe(source, whisperCallable)
         elif (vad == 'periodic-vad'):
             # Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an infinite loop, but
             # it may create a break in the middle of a sentence, causing some artifacts.
-            periodic_vad = VadPeriodicTranscription(periodic_duration=…)
+            periodic_vad = VadPeriodicTranscription(periodic_duration=vadMaxMergeSize)
             result = periodic_vad.transcribe(source, whisperCallable)
         else:
             # Default VAD

@@ -178,13 +182,14 @@ def createUi(inputAudioMaxDuration, share=False, server_name: str = None):
     ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
     ui_description += " as well as speech translation and language identification. "

-    ui_description += "\n\n…"
-    ui_description += "it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option."
+    ui_description += "\n\n\n\nFor longer audio files (>10 minutes), it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option."

     if inputAudioMaxDuration > 0:
         ui_description += "\n\n" + "Max audio file length: " + str(inputAudioMaxDuration) + " s"

-    demo = gr.Interface(fn=ui.transcribeFile, description=ui_description, inputs=[
+    ui_article = "Read the [documentation here](https://huggingface.co/spaces/aadnk/whisper-webui/blob/main/docs/options.md)"
+
+    demo = gr.Interface(fn=ui.transcribeFile, description=ui_description, article=ui_article, inputs=[
         gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="medium", label="Model"),
         gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
         gr.Text(label="URL (YouTube, etc.)"),

@@ -192,6 +197,8 @@ def createUi(inputAudioMaxDuration, share=False, server_name: str = None):
         gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
         gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
         gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "periodic-vad"], label="VAD"),
+        gr.Number(label="VAD - Merge Window (s)", precision=0, value=10),
+        gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=150)
     ], outputs=[
         gr.File(label="Download"),
         gr.Text(label="Transcription"),
```
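The two `gr.Number` inputs map positionally onto the new `vadMergeWindow` and `vadMaxMergeSize` parameters of `transcribeFile`. A minimal, runnable sketch of that wiring, with a hypothetical `transcribe_stub` standing in for the real handler (which also takes the model, language, input and task arguments shown in the diff):

```python
import gradio as gr

# Hypothetical stand-in for ui.transcribeFile: Gradio passes each input
# component's value as a positional argument, so the two Number fields
# arrive as the last two parameters.
def transcribe_stub(vad, vad_merge_window, vad_max_merge_size):
    return f"VAD={vad}, merge window={vad_merge_window} s, max merge size={vad_max_merge_size} s"

demo = gr.Interface(fn=transcribe_stub, inputs=[
    gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "periodic-vad"], label="VAD"),
    gr.Number(label="VAD - Merge Window (s)", precision=0, value=10),
    gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=150),
], outputs=gr.Text(label="Result"))

if __name__ == "__main__":
    demo.launch()
```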
docs/options.md
ADDED

# Options

To transcribe or translate an audio file, you can either copy a URL from a website (all [websites](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md) supported by YT-DLP will work, including YouTube), upload an audio file (choose "All Files (*.*)" in the file selector to select any file type, including video files), or use the microphone.

For longer audio files (>10 minutes), it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option.

## Model

Select the model that Whisper will use to transcribe the audio:

| Size   | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
|--------|------------|--------------------|--------------------|---------------|----------------|
| tiny   | 39 M       | tiny.en            | tiny               | ~1 GB         | ~32x           |
| base   | 74 M       | base.en            | base               | ~1 GB         | ~16x           |
| small  | 244 M      | small.en           | small              | ~2 GB         | ~6x            |
| medium | 769 M      | medium.en          | medium             | ~5 GB         | ~2x            |
| large  | 1550 M     | N/A                | large              | ~10 GB        | 1x             |

## Language

Select the language, or leave it empty for Whisper to detect it automatically.

Note that if the selected language differs from the language in the audio, Whisper may start to translate the audio to the selected language. For instance, if the audio is in English but you select Japanese, the model may translate the audio to Japanese.

## Inputs

The options "URL (YouTube, etc.)", "Upload Audio" and "Microphone Input" allow you to send an audio input to the model.

Note that the UI will only process the first valid input - i.e. if you both enter a URL and upload an audio file, only the URL will be processed.

## Task

Select the task - either "transcribe" to transcribe the audio to text, or "translate" to translate it to English.

## VAD

* none
  * Run Whisper on the entire audio input.
* silero-vad
  * Use Silero VAD to detect sections that contain speech, and run Whisper independently on each section. Whisper is also run on the gaps between the speech sections.
* silero-vad-skip-gaps
  * As above, but sections that don't contain speech according to Silero will be skipped. This is slightly faster, but may cause dialogue to be skipped.
* periodic-vad
  * Create sections of speech every "VAD - Max Merge Size" seconds. This is very fast and simple, but may break a sentence or word in two.

## VAD - Merge Window (s)

If set, any adjacent speech sections that are at most this number of seconds apart will be automatically merged.

## VAD - Max Merge Size (s)

Disables merging of adjacent speech sections if they are already this number of seconds long.
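The periodic-vad mode above is easy to picture with a short sketch (a rough illustration only, not the app's actual `VadPeriodicTranscription` implementation):

```python
# Rough sketch of periodic-vad: mark every 'VAD - Max Merge Size' seconds
# as one speech section, regardless of where words actually fall.
def periodic_sections(duration: float, period: float = 150.0):
    sections = []
    start = 0.0
    while start < duration:
        sections.append({'start': start, 'end': min(start + period, duration)})
        start += period
    return sections

# A 400 s file becomes three fixed sections, possibly splitting a sentence:
print(periodic_sections(400.0))
# [{'start': 0.0, 'end': 150.0}, {'start': 150.0, 'end': 300.0}, {'start': 300.0, 'end': 400.0}]
```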
vad.py
CHANGED

```diff
@@ -20,8 +20,12 @@ import numpy as np
 from utils import format_timestamp

 # Defaults for Silero
+# TODO: Make these configurable?
+
 SPEECH_TRESHOLD = 0.3
 MAX_SILENT_PERIOD = 10 # seconds
+MAX_MERGE_SIZE = 150 # Do not create segments larger than 2.5 minutes
+
 SEGMENT_PADDING_LEFT = 1 # Start detected text segment early
 SEGMENT_PADDING_RIGHT = 3 # End detected segments late

@@ -29,11 +33,12 @@ SEGMENT_PADDING_RIGHT = 3 # End detected segments late
 TRANSCRIBE_NON_SPEECH = False

 class AbstractTranscription(ABC):
-    def __init__(self, segment_padding_left: int = None, segment_padding_right = None, max_silent_period: int = None, transcribe_non_speech: bool = False):
+    def __init__(self, segment_padding_left: int = None, segment_padding_right = None, max_silent_period: int = None, max_merge_size: int = None, transcribe_non_speech: bool = False):
         self.sampling_rate = 16000
         self.segment_padding_left = segment_padding_left
         self.segment_padding_right = segment_padding_right
         self.max_silent_period = max_silent_period
+        self.max_merge_size = max_merge_size
         self.transcribe_non_speech = transcribe_non_speech

     def get_audio_segment(self, str, start_time: str = None, duration: str = None):

@@ -76,7 +81,7 @@ class AbstractTranscription(ABC):
         seconds_timestamps = self.get_transcribe_timestamps(audio)

         padded = self.pad_timestamps(seconds_timestamps, self.segment_padding_left, self.segment_padding_right)
-        merged = self.merge_timestamps(padded, self.max_silent_period)
+        merged = self.merge_timestamps(padded, self.max_silent_period, self.max_merge_size)

         print("Timestamps:")
         pprint(merged)

@@ -188,8 +193,8 @@ class AbstractTranscription(ABC):

         return result

-    def merge_timestamps(self, timestamps: List[Dict[str, Any]], …):
-        if …:
+    def merge_timestamps(self, timestamps: List[Dict[str, Any]], max_merge_gap: float, max_merge_size: float):
+        if max_merge_gap is None:
             return timestamps

         result = []

@@ -202,8 +207,9 @@ class AbstractTranscription(ABC):

         # Get distance to the previous entry
         distance = entry['start'] - current_entry['end']
+        current_entry_size = current_entry['end'] - current_entry['start']

-        if distance <= …:
+        if distance <= max_merge_gap and (max_merge_size is None or current_entry_size <= max_merge_size):
             # Merge
             current_entry['end'] = entry['end']
         else:

@@ -231,8 +237,11 @@ class AbstractTranscription(ABC):
         return result

 class VadSileroTranscription(AbstractTranscription):
-    def __init__(self, …):
-        …
+    def __init__(self, segment_padding_left=SEGMENT_PADDING_LEFT, segment_padding_right=SEGMENT_PADDING_RIGHT,
+                 max_silent_period=MAX_SILENT_PERIOD, max_merge_size=MAX_MERGE_SIZE, transcribe_non_speech: bool = False,
+                 copy = None):
+        super().__init__(segment_padding_left=segment_padding_left, segment_padding_right=segment_padding_right,
+                         max_silent_period=max_silent_period, max_merge_size=max_merge_size, transcribe_non_speech=transcribe_non_speech)

         if copy:
             self.model = copy.model
```
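The heart of the change is the extra size guard in `merge_timestamps`: adjacent sections are merged while the gap is within `max_merge_gap`, but a section that has already grown past `max_merge_size` is no longer extended. A self-contained sketch of that rule (the loop scaffolding around the changed lines is assumed, since the diff only shows fragments of it):

```python
from typing import Any, Dict, List

def merge_timestamps(timestamps: List[Dict[str, Any]], max_merge_gap: float, max_merge_size: float):
    if max_merge_gap is None:
        return timestamps

    result = []
    current_entry = None

    for entry in timestamps:
        if current_entry is None:
            current_entry = dict(entry)
            continue

        # Get distance to the previous entry, and the size it has grown to
        distance = entry['start'] - current_entry['end']
        current_entry_size = current_entry['end'] - current_entry['start']

        if distance <= max_merge_gap and (max_merge_size is None or current_entry_size <= max_merge_size):
            # Merge into the current section
            current_entry['end'] = entry['end']
        else:
            # Emit the current section and start a new one
            result.append(current_entry)
            current_entry = dict(entry)

    if current_entry is not None:
        result.append(current_entry)
    return result

# Sections 5 s apart merge (gap <= 10 s)...
print(merge_timestamps([{'start': 0, 'end': 20}, {'start': 25, 'end': 40}], 10, 150))
# [{'start': 0, 'end': 40}]

# ...but a section already 200 s long (> 150 s) is not extended further.
print(merge_timestamps([{'start': 0, 'end': 200}, {'start': 205, 'end': 220}], 10, 150))
# [{'start': 0, 'end': 200}, {'start': 205, 'end': 220}]
```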