File size: 1,731 Bytes
c72e80d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
from dataclasses import dataclass, field
@dataclass
class VADHandlerArguments:
    """Configuration options for the voice activity detection (VAD) handler.

    Every field carries a ``help`` entry in its ``metadata`` describing the
    option; presumably this is consumed by a dataclass-aware argument parser
    to build command-line help (confirm against the caller).

    Attributes:
        thresh: VAD confidence threshold. Defaults to 0.3.
        sample_rate: Audio sample rate in Hertz. Defaults to 16000.
        min_silence_ms: Minimum silence length, in milliseconds, used for
            segmenting speech. Defaults to 250.
        min_speech_ms: Minimum speech segment length, in milliseconds, to be
            considered valid speech. Defaults to 500.
        max_speech_ms: Maximum continuous speech length before a split is
            forced. Defaults to infinity (no forced split).
        speech_pad_ms: Padding, in milliseconds, added to the beginning and
            end of detected speech segments. Defaults to 500.
        audio_enhancement: Whether to apply audio enhancement. Defaults to
            False.
    """

    thresh: float = field(
        default=0.3,
        metadata={
            "help": "The threshold value for voice activity detection (VAD). Values typically range from 0 to 1, with higher values requiring higher confidence in speech detection."
        },
    )
    sample_rate: int = field(
        default=16000,
        metadata={
            "help": "The sample rate of the audio in Hertz. Default is 16000 Hz, which is a common setting for voice audio."
        },
    )
    min_silence_ms: int = field(
        default=250,
        metadata={
            "help": "Minimum length of silence intervals to be used for segmenting speech. Measured in milliseconds. Default is 250 ms."
        },
    )
    min_speech_ms: int = field(
        default=500,
        metadata={
            "help": "Minimum length of speech segments to be considered valid speech. Measured in milliseconds. Default is 500 ms."
        },
    )
    # Annotated as float (not int) because the default is float("inf").
    max_speech_ms: float = field(
        default=float("inf"),
        metadata={
            "help": "Maximum length of continuous speech before forcing a split. Default is infinite, allowing for uninterrupted speech segments."
        },
    )
    # The help text previously claimed a 250 ms default while the actual
    # default is 500; the text now matches the real default.
    speech_pad_ms: int = field(
        default=500,
        metadata={
            "help": "Amount of padding added to the beginning and end of detected speech segments. Measured in milliseconds. Default is 500 ms."
        },
    )
    audio_enhancement: bool = field(
        default=False,
        metadata={
            "help": "improves sound quality by applying techniques like noise reduction, equalization, and echo cancellation. Default is False."
        },
    )
|