aadnk commited on
Commit
084aa80
·
1 Parent(s): ea7f8cc

Add max merge size in VAD

Browse files

Also make this and the VAD merge window configurable.

Files changed (3) hide show
  1. app.py +16 -9
  2. docs/options.md +45 -0
  3. vad.py +16 -7
app.py CHANGED
@@ -53,7 +53,7 @@ class UI:
53
  self.vad_model = None
54
  self.inputAudioMaxDuration = inputAudioMaxDuration
55
 
56
- def transcribeFile(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad):
57
  try:
58
  source, sourceName = self.getSource(urlData, uploadFile, microphoneData)
59
 
@@ -74,19 +74,23 @@ class UI:
74
  if (vad == 'silero-vad'):
75
  # Use Silero VAD and include gaps
76
  if (self.vad_model is None):
77
- self.vad_model = VadSileroTranscription(transcribe_non_speech= True)
78
- result = self.vad_model.transcribe(source, whisperCallable)
 
 
 
79
  elif (vad == 'silero-vad-skip-gaps'):
80
  # Use Silero VAD
81
  if (self.vad_model is None):
82
- self.vad_model = VadSileroTranscription(transcribe_non_speech= True)
83
 
84
- skip_gaps = VadSileroTranscription(transcribe_non_speech = False, copy=self.vad_model)
 
85
  result = skip_gaps.transcribe(source, whisperCallable)
86
  elif (vad == 'periodic-vad'):
87
  # Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an infinite loop, but
88
  # it may create a break in the middle of a sentence, causing some artifacts.
89
- periodic_vad = VadPeriodicTranscription(periodic_duration=60 * 5)
90
  result = periodic_vad.transcribe(source, whisperCallable)
91
  else:
92
  # Default VAD
@@ -178,13 +182,14 @@ def createUi(inputAudioMaxDuration, share=False, server_name: str = None):
178
  ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
179
  ui_description += " as well as speech translation and language identification. "
180
 
181
- ui_description += "\n\n" + "Note: You can upload more audio (and even video) types by changing to All Files (*.*) in the file selector. For longer audio files (>10 minutes), "
182
- ui_description += "it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option."
183
 
184
  if inputAudioMaxDuration > 0:
185
  ui_description += "\n\n" + "Max audio file length: " + str(inputAudioMaxDuration) + " s"
186
 
187
- demo = gr.Interface(fn=ui.transcribeFile, description=ui_description, inputs=[
 
 
188
  gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="medium", label="Model"),
189
  gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
190
  gr.Text(label="URL (YouTube, etc.)"),
@@ -192,6 +197,8 @@ def createUi(inputAudioMaxDuration, share=False, server_name: str = None):
192
  gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
193
  gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
194
  gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "periodic-vad"], label="VAD"),
 
 
195
  ], outputs=[
196
  gr.File(label="Download"),
197
  gr.Text(label="Transcription"),
 
53
  self.vad_model = None
54
  self.inputAudioMaxDuration = inputAudioMaxDuration
55
 
56
+ def transcribeFile(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize):
57
  try:
58
  source, sourceName = self.getSource(urlData, uploadFile, microphoneData)
59
 
 
74
  if (vad == 'silero-vad'):
75
  # Use Silero VAD and include gaps
76
  if (self.vad_model is None):
77
+ self.vad_model = VadSileroTranscription()
78
+
79
+ process_gaps = VadSileroTranscription(transcribe_non_speech = True,
80
+ max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize, copy=self.vad_model)
81
+ result = process_gaps.transcribe(source, whisperCallable)
82
  elif (vad == 'silero-vad-skip-gaps'):
83
  # Use Silero VAD
84
  if (self.vad_model is None):
85
+ self.vad_model = VadSileroTranscription()
86
 
87
+ skip_gaps = VadSileroTranscription(transcribe_non_speech = False,
88
+ max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize, copy=self.vad_model)
89
  result = skip_gaps.transcribe(source, whisperCallable)
90
  elif (vad == 'periodic-vad'):
91
  # Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an infinite loop, but
92
  # it may create a break in the middle of a sentence, causing some artifacts.
93
+ periodic_vad = VadPeriodicTranscription(periodic_duration=vadMaxMergeSize)
94
  result = periodic_vad.transcribe(source, whisperCallable)
95
  else:
96
  # Default VAD
 
182
  ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
183
  ui_description += " as well as speech translation and language identification. "
184
 
185
+ ui_description += "\n\n\n\nFor longer audio files (>10 minutes), it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option."
 
186
 
187
  if inputAudioMaxDuration > 0:
188
  ui_description += "\n\n" + "Max audio file length: " + str(inputAudioMaxDuration) + " s"
189
 
190
+ ui_article = "Read the [documentation her](https://huggingface.co/spaces/aadnk/whisper-webui/blob/main/docs/options.md)"
191
+
192
+ demo = gr.Interface(fn=ui.transcribeFile, description=ui_description, article=ui_article, inputs=[
193
  gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="medium", label="Model"),
194
  gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
195
  gr.Text(label="URL (YouTube, etc.)"),
 
197
  gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
198
  gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
199
  gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "periodic-vad"], label="VAD"),
200
+ gr.Number(label="VAD - Merge Window (s)", precision=0, value=10),
201
+ gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=150)
202
  ], outputs=[
203
  gr.File(label="Download"),
204
  gr.Text(label="Transcription"),
docs/options.md ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Options
2
+ To transcribe or translate an audio file, you can copy a URL from a website (all [websites](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md) supported by YT-DLP will work, including YouTube), upload an audio file (choose "All Files (*.*)" in the file selector to select any file type, including video files), or use the microphone.
3
+
4
+ For longer audio files (>10 minutes), it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option.
5
+
6
+ ## Model
7
+ Select the model that Whisper will use to transcribe the audio:
8
+
9
+ | Size | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
10
+ |--------|------------|--------------------|--------------------|---------------|----------------|
11
+ | tiny | 39 M | tiny.en | tiny | ~1 GB | ~32x |
12
+ | base | 74 M | base.en | base | ~1 GB | ~16x |
13
+ | small | 244 M | small.en | small | ~2 GB | ~6x |
14
+ | medium | 769 M | medium.en | medium | ~5 GB | ~2x |
15
+ | large | 1550 M | N/A | large | ~10 GB | 1x |
16
+
17
+ ## Language
18
+
19
+ Select the language, or leave it empty for Whisper to automatically detect it.
20
+
21
+ Note that if the selected language and the language in the audio differ, Whisper may start to translate the audio to the selected language. For instance, if the audio is in English but you select Japanese, the model may translate the audio to Japanese.
22
+
23
+ ## Inputs
24
+ The options "URL (YouTube, etc.)", "Upload Audio" and "Microphone Input" allow you to send an audio input to the model.
25
+
26
+ Note that the UI will only process the first valid input - i.e. if you enter both a URL and upload an audio file, it will only process the URL.
27
+
28
+ ## Task
29
+ Select the task - either "transcribe" to transcribe the audio to text, or "translate" to translate it to English.
30
+
31
+ ## VAD
32
+ * none
33
+ * Run whisper on the entire audio input
34
+ * silero-vad
35
+ * Use Silero VAD to detect sections that contain speech, and run Whisper independently on each section. Whisper is also run on the gaps between each speech section.
36
+ * silero-vad-skip-gaps
37
+ * As above, but sections that do not contain speech according to Silero will be skipped. This will be slightly faster, but may cause dialogue to be skipped.
38
+ * periodic-vad
39
+ * Create sections of speech every 'VAD - Max Merge Size' seconds. This is very fast and simple, but will potentially break a sentence or word in two.
40
+
41
+ ## VAD - Merge Window
42
+ If set, any adjacent speech sections that are at most this number of seconds apart will be automatically merged.
43
+
44
+ ## VAD - Max Merge Size (s)
45
+ Disables merging of adjacent speech sections once the current section is longer than this number of seconds.
vad.py CHANGED
@@ -20,8 +20,12 @@ import numpy as np
20
  from utils import format_timestamp
21
 
22
  # Defaults for Silero
 
 
23
  SPEECH_TRESHOLD = 0.3
24
  MAX_SILENT_PERIOD = 10 # seconds
 
 
25
  SEGMENT_PADDING_LEFT = 1 # Start detected text segment early
26
  SEGMENT_PADDING_RIGHT = 3 # End detected segments late
27
 
@@ -29,11 +33,12 @@ SEGMENT_PADDING_RIGHT = 3 # End detected segments late
29
  TRANSCRIBE_NON_SPEECH = False
30
 
31
  class AbstractTranscription(ABC):
32
- def __init__(self, segment_padding_left: int = None, segment_padding_right = None, max_silent_period: int = None, transcribe_non_speech: bool = False):
33
  self.sampling_rate = 16000
34
  self.segment_padding_left = segment_padding_left
35
  self.segment_padding_right = segment_padding_right
36
  self.max_silent_period = max_silent_period
 
37
  self.transcribe_non_speech = transcribe_non_speech
38
 
39
  def get_audio_segment(self, str, start_time: str = None, duration: str = None):
@@ -76,7 +81,7 @@ class AbstractTranscription(ABC):
76
  seconds_timestamps = self.get_transcribe_timestamps(audio)
77
 
78
  padded = self.pad_timestamps(seconds_timestamps, self.segment_padding_left, self.segment_padding_right)
79
- merged = self.merge_timestamps(padded, self.max_silent_period)
80
 
81
  print("Timestamps:")
82
  pprint(merged)
@@ -188,8 +193,8 @@ class AbstractTranscription(ABC):
188
 
189
  return result
190
 
191
- def merge_timestamps(self, timestamps: List[Dict[str, Any]], max_distance: float):
192
- if max_distance is None:
193
  return timestamps
194
 
195
  result = []
@@ -202,8 +207,9 @@ class AbstractTranscription(ABC):
202
 
203
  # Get distance to the previous entry
204
  distance = entry['start'] - current_entry['end']
 
205
 
206
- if distance <= max_distance:
207
  # Merge
208
  current_entry['end'] = entry['end']
209
  else:
@@ -231,8 +237,11 @@ class AbstractTranscription(ABC):
231
  return result
232
 
233
  class VadSileroTranscription(AbstractTranscription):
234
- def __init__(self, transcribe_non_speech: bool = False, copy = None):
235
- super().__init__(SEGMENT_PADDING_LEFT, SEGMENT_PADDING_RIGHT, MAX_SILENT_PERIOD, transcribe_non_speech)
 
 
 
236
 
237
  if copy:
238
  self.model = copy.model
 
20
  from utils import format_timestamp
21
 
22
  # Defaults for Silero
23
+ # TODO: Make these configurable?
24
+
25
  SPEECH_TRESHOLD = 0.3
26
  MAX_SILENT_PERIOD = 10 # seconds
27
+ MAX_MERGE_SIZE = 150 # Do not create segments larger than 2.5 minutes
28
+
29
  SEGMENT_PADDING_LEFT = 1 # Start detected text segment early
30
  SEGMENT_PADDING_RIGHT = 3 # End detected segments late
31
 
 
33
  TRANSCRIBE_NON_SPEECH = False
34
 
35
  class AbstractTranscription(ABC):
36
+ def __init__(self, segment_padding_left: int = None, segment_padding_right = None, max_silent_period: int = None, max_merge_size: int = None, transcribe_non_speech: bool = False):
37
  self.sampling_rate = 16000
38
  self.segment_padding_left = segment_padding_left
39
  self.segment_padding_right = segment_padding_right
40
  self.max_silent_period = max_silent_period
41
+ self.max_merge_size = max_merge_size
42
  self.transcribe_non_speech = transcribe_non_speech
43
 
44
  def get_audio_segment(self, str, start_time: str = None, duration: str = None):
 
81
  seconds_timestamps = self.get_transcribe_timestamps(audio)
82
 
83
  padded = self.pad_timestamps(seconds_timestamps, self.segment_padding_left, self.segment_padding_right)
84
+ merged = self.merge_timestamps(padded, self.max_silent_period, self.max_merge_size)
85
 
86
  print("Timestamps:")
87
  pprint(merged)
 
193
 
194
  return result
195
 
196
+ def merge_timestamps(self, timestamps: List[Dict[str, Any]], max_merge_gap: float, max_merge_size: float):
197
+ if max_merge_gap is None:
198
  return timestamps
199
 
200
  result = []
 
207
 
208
  # Get distance to the previous entry
209
  distance = entry['start'] - current_entry['end']
210
+ current_entry_size = current_entry['end'] - current_entry['start']
211
 
212
+ if distance <= max_merge_gap and (max_merge_size is None or current_entry_size <= max_merge_size):
213
  # Merge
214
  current_entry['end'] = entry['end']
215
  else:
 
237
  return result
238
 
239
  class VadSileroTranscription(AbstractTranscription):
240
+ def __init__(self, segment_padding_left=SEGMENT_PADDING_LEFT, segment_padding_right=SEGMENT_PADDING_RIGHT,
241
+ max_silent_period=MAX_SILENT_PERIOD, max_merge_size=MAX_MERGE_SIZE, transcribe_non_speech: bool = False,
242
+ copy = None):
243
+ super().__init__(segment_padding_left=segment_padding_left, segment_padding_right=segment_padding_right,
244
+ max_silent_period=max_silent_period, max_merge_size=max_merge_size, transcribe_non_speech=transcribe_non_speech)
245
 
246
  if copy:
247
  self.model = copy.model