aadnk committed on
Commit
b604ea1
·
1 Parent(s): 2ea4ed3

Process VAD in chunks of up to 1 hour

Browse files
Files changed (1) hide show
  1. src/vad.py +28 -7
src/vad.py CHANGED
@@ -34,6 +34,8 @@ TRANSCRIBE_NON_SPEECH = False
34
  # Minimum size of segments to process
35
  MIN_SEGMENT_DURATION = 1
36
 
 
 
37
  class AbstractTranscription(ABC):
38
  def __init__(self, segment_padding_left: int = None, segment_padding_right = None, max_silent_period: int = None, max_merge_size: int = None, transcribe_non_speech: bool = False):
39
  self.sampling_rate = 16000
@@ -89,7 +91,7 @@ class AbstractTranscription(ABC):
89
  pprint(merged)
90
 
91
  if self.transcribe_non_speech:
92
- max_audio_duration = float(ffmpeg.probe(audio)["format"]["duration"])
93
 
94
  # Expand segments to include the gaps between them
95
  merged = self.expand_gaps(merged, total_duration=max_audio_duration)
@@ -120,7 +122,7 @@ class AbstractTranscription(ABC):
120
  print("Running whisper from ", format_timestamp(segment_start), " to ", format_timestamp(segment_end), ", duration: ", segment_duration, "expanded: ", segment_expand_amount)
121
  segment_result = whisperCallable(segment_audio)
122
 
123
- adjusted_segments = self.adjust_whisper_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)
124
 
125
  # Append to output
126
  result['text'] += segment_result['text']
@@ -198,7 +200,7 @@ class AbstractTranscription(ABC):
198
 
199
  return result
200
 
201
- def adjust_whisper_timestamp(self, segments: Iterator[dict], adjust_seconds: float, max_source_time: float = None):
202
  result = []
203
 
204
  for segment in segments:
@@ -303,10 +305,26 @@ class VadSileroTranscription(AbstractTranscription):
303
  (self.get_speech_timestamps, _, _, _, _) = utils
304
 
305
  def get_transcribe_timestamps(self, audio: str):
306
- wav = self.get_audio_segment(audio)
 
 
 
 
 
 
 
307
 
308
- sample_timestamps = self.get_speech_timestamps(wav, self.model, sampling_rate=self.sampling_rate, threshold=SPEECH_TRESHOLD)
309
- seconds_timestamps = self.multiply_timestamps(sample_timestamps, factor=1 / self.sampling_rate)
 
 
 
 
 
 
 
 
 
310
 
311
  return seconds_timestamps
312
 
@@ -318,7 +336,7 @@ class VadPeriodicTranscription(AbstractTranscription):
318
 
319
  def get_transcribe_timestamps(self, audio: str):
320
  # Get duration in seconds
321
- audio_duration = float(ffmpeg.probe(audio)["format"]["duration"])
322
  result = []
323
 
324
  # Generate a timestamp every N seconds
@@ -336,6 +354,9 @@ class VadPeriodicTranscription(AbstractTranscription):
336
 
337
  return result
338
 
 
 
 
339
  def load_audio(file: str, sample_rate: int = 16000,
340
  start_time: str = None, duration: str = None):
341
  """
 
34
  # Minimum size of segments to process
35
  MIN_SEGMENT_DURATION = 1
36
 
37
+ VAD_MAX_PROCESSING_CHUNK = 60 * 60 # 60 minutes of audio
38
+
39
  class AbstractTranscription(ABC):
40
  def __init__(self, segment_padding_left: int = None, segment_padding_right = None, max_silent_period: int = None, max_merge_size: int = None, transcribe_non_speech: bool = False):
41
  self.sampling_rate = 16000
 
91
  pprint(merged)
92
 
93
  if self.transcribe_non_speech:
94
+ max_audio_duration = get_audio_duration(audio)
95
 
96
  # Expand segments to include the gaps between them
97
  merged = self.expand_gaps(merged, total_duration=max_audio_duration)
 
122
  print("Running whisper from ", format_timestamp(segment_start), " to ", format_timestamp(segment_end), ", duration: ", segment_duration, "expanded: ", segment_expand_amount)
123
  segment_result = whisperCallable(segment_audio)
124
 
125
+ adjusted_segments = self.adjust_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)
126
 
127
  # Append to output
128
  result['text'] += segment_result['text']
 
200
 
201
  return result
202
 
203
+ def adjust_timestamp(self, segments: Iterator[dict], adjust_seconds: float, max_source_time: float = None):
204
  result = []
205
 
206
  for segment in segments:
 
305
  (self.get_speech_timestamps, _, _, _, _) = utils
306
 
307
  def get_transcribe_timestamps(self, audio: str):
308
+ audio_duration = get_audio_duration(audio)
309
+ result = []
310
+
311
+ # Divide processing of audio into chunks
312
+ chunk_start = 0.0
313
+
314
+ while (chunk_start < audio_duration):
315
+ chunk_duration = min(audio_duration - chunk_start, VAD_MAX_PROCESSING_CHUNK)
316
 
317
+ print("Processing VAD in chunk from {} to {}".format(format_timestamp(chunk_start), format_timestamp(chunk_start + chunk_duration)))
318
+ wav = self.get_audio_segment(audio, str(chunk_start), str(chunk_duration))
319
+
320
+ sample_timestamps = self.get_speech_timestamps(wav, self.model, sampling_rate=self.sampling_rate, threshold=SPEECH_TRESHOLD)
321
+ seconds_timestamps = self.multiply_timestamps(sample_timestamps, factor=1 / self.sampling_rate)
322
+ adjusted = self.adjust_timestamp(seconds_timestamps, adjust_seconds=chunk_start, max_source_time=chunk_start + chunk_duration)
323
+
324
+ pprint(adjusted)
325
+
326
+ result.extend(adjusted)
327
+ chunk_start += chunk_duration
328
 
329
  return seconds_timestamps
330
 
 
336
 
337
  def get_transcribe_timestamps(self, audio: str):
338
  # Get duration in seconds
339
+ audio_duration = get_audio_duration(audio)
340
  result = []
341
 
342
  # Generate a timestamp every N seconds
 
354
 
355
  return result
356
 
357
+ def get_audio_duration(file: str):
358
+ return float(ffmpeg.probe(file)["format"]["duration"])
359
+
360
  def load_audio(file: str, sample_rate: int = 16000,
361
  start_time: str = None, duration: str = None):
362
  """