IliaLarchenko committed on
Commit
b989f04
·
1 Parent(s): 6bb887d

Cleaned up audio.py

Browse files
Files changed (1) hide show
  1. api/audio.py +5 -68
api/audio.py CHANGED
@@ -41,7 +41,7 @@ class STTManager:
41
 
42
  self.config = config
43
  self.status = self.test_stt()
44
- self.streaming = self.test_streaming()
45
 
46
  def numpy_audio_to_bytes(self, audio_data: np.ndarray) -> bytes:
47
  """
@@ -70,8 +70,7 @@ class STTManager:
70
 
71
  :param audio: Tuple containing the sample rate and audio data as numpy array.
72
  :param audio_buffer: Current audio buffer as numpy array.
73
- :param transcript: Current transcript dictionary.
74
- :return: Updated transcript, updated audio buffer, and transcript text.
75
  """
76
 
77
  has_voice = detect_voice(audio[1])
@@ -87,69 +86,19 @@ class STTManager:
87
 
88
  return np.array([], dtype=np.int16), audio_buffer
89
 
90
- def transcribe_audio(self, audio: np.ndarray, text) -> str:
91
  if len(audio) < 500:
92
  return text
93
  else:
94
  transcript = self.transcribe_numpy_array(audio, context=text)
95
  return text + " " + transcript
96
 
97
- def speech_to_text_stream(self, audio: bytes) -> List[Dict[str, str]]:
98
- """
99
- Convert speech to text from a byte stream using streaming.
100
-
101
- :param audio: Bytes representation of audio data.
102
- :return: List of dictionaries containing transcribed words and their timestamps.
103
- """
104
- if self.config.stt.type == "HF_API":
105
- raise APIError("STT Error: Streaming not supported for this STT type")
106
- try:
107
- data = ("temp.wav", audio, "audio/wav")
108
- client = OpenAI(base_url=self.config.stt.url, api_key=self.config.stt.key)
109
- transcription = client.audio.transcriptions.create(
110
- model=self.config.stt.name, file=data, response_format="verbose_json", timestamp_granularities=["word"]
111
- )
112
- except APIError:
113
- raise
114
- except Exception as e:
115
- raise APIError(f"STT Error: Unexpected error: {e}")
116
- return transcription.words
117
-
118
- def merge_transcript(self, transcript: Dict, new_transcript: List[Dict[str, str]]) -> Dict:
119
- """
120
- Merge new transcript data with the existing transcript.
121
-
122
- :param transcript: Existing transcript dictionary.
123
- :param new_transcript: New transcript data to merge.
124
- :return: Updated transcript dictionary.
125
- """
126
- cut_off = transcript["last_cutoff"]
127
- transcript["last_cutoff"] = self.MAX_RELIABILITY_CUTOFF - self.STEP_LENGTH
128
-
129
- transcript["words"] = transcript["words"][: len(transcript["words"]) - transcript["not_confirmed"]]
130
- transcript["not_confirmed"] = 0
131
- first_word = True
132
-
133
- for word_dict in new_transcript:
134
- if word_dict["start"] >= cut_off:
135
- if first_word:
136
- if len(transcript["words"]) > 0 and transcript["words"][-1] == word_dict["word"]:
137
- continue
138
- first_word = False
139
- transcript["words"].append(word_dict["word"])
140
- if word_dict["start"] > self.MAX_RELIABILITY_CUTOFF:
141
- transcript["not_confirmed"] += 1
142
- else:
143
- transcript["last_cutoff"] = max(1.0, word_dict["end"] - self.STEP_LENGTH)
144
-
145
- transcript["text"] = " ".join(transcript["words"])
146
- return transcript
147
-
148
  def transcribe_numpy_array(self, audio: np.ndarray, context: Optional[str] = None) -> str:
149
  """
150
  Convert speech to text from a full audio segment.
151
 
152
  :param audio: Tuple containing the sample rate and audio data as numpy array.
 
153
  :return: Transcribed text.
154
  """
155
  audio_bytes = self.numpy_audio_to_bytes(audio)
@@ -183,19 +132,7 @@ class STTManager:
183
  :return: True if the STT service is working, False otherwise.
184
  """
185
  try:
186
- self.speech_to_text_full((48000, np.zeros(10000)))
187
- return True
188
- except:
189
- return False
190
-
191
- def test_streaming(self) -> bool:
192
- """
193
- Test if the STT streaming service is working correctly.
194
-
195
- :return: True if the STT streaming service is working, False otherwise.
196
- """
197
- try:
198
- self.speech_to_text_stream(self.numpy_audio_to_bytes(np.zeros(10000)))
199
  return True
200
  except:
201
  return False
 
41
 
42
  self.config = config
43
  self.status = self.test_stt()
44
+ self.streaming = self.status
45
 
46
  def numpy_audio_to_bytes(self, audio_data: np.ndarray) -> bytes:
47
  """
 
70
 
71
  :param audio: Tuple containing the sample rate and audio data as numpy array.
72
  :param audio_buffer: Current audio buffer as numpy array.
73
+ :return: Updated current audio buffer, audio for transcription
 
74
  """
75
 
76
  has_voice = detect_voice(audio[1])
 
86
 
87
  return np.array([], dtype=np.int16), audio_buffer
88
 
89
+ def transcribe_audio(self, audio: np.ndarray, text: str = "") -> str:
90
  if len(audio) < 500:
91
  return text
92
  else:
93
  transcript = self.transcribe_numpy_array(audio, context=text)
94
  return text + " " + transcript
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  def transcribe_numpy_array(self, audio: np.ndarray, context: Optional[str] = None) -> str:
97
  """
98
  Convert speech to text from a full audio segment.
99
 
100
  :param audio: Tuple containing the sample rate and audio data as numpy array.
101
+ :param context: Optional context for the transcription.
102
  :return: Transcribed text.
103
  """
104
  audio_bytes = self.numpy_audio_to_bytes(audio)
 
132
  :return: True if the STT service is working, False otherwise.
133
  """
134
  try:
135
+ self.transcribe_audio(np.zeros(10000))
 
 
 
 
 
 
 
 
 
 
 
 
136
  return True
137
  except:
138
  return False