IliaLarchenko committed • Commit 3a5dbe6 • 1 parent: 43d5e00

Added STT streaming

Files changed:
- api/audio.py: +86 -24
- app.py: +34 -5
api/audio.py CHANGED

@@ -10,31 +10,87 @@ from openai import OpenAI
 from utils.errors import APIError, AudioConversionError
 
 
-def numpy_audio_to_bytes(audio_data):
-    sample_rate = 44100
-    num_channels = 1
-    sampwidth = 2
-
-    buffer = io.BytesIO()
-    try:
-        with wave.open(buffer, "wb") as wf:
-            wf.setnchannels(num_channels)
-            wf.setsampwidth(sampwidth)
-            wf.setframerate(sample_rate)
-            wf.writeframes(audio_data.tobytes())
-    except Exception as e:
-        raise AudioConversionError(f"Error converting numpy array to audio bytes: {e}")
-    return buffer.getvalue()
-
-
 class STTManager:
     def __init__(self, config):
+        self.SAMPLE_RATE = 48000
+        self.CHUNK_LENGTH = 5
+        self.STEP_LENGTH = 3
+        self.MAX_RELIABILITY_CUTOFF = self.CHUNK_LENGTH - 1
+
         self.config = config
         self.status = self.test_stt()
-        self.streaming =
+        self.streaming = self.test_streaming()
+
+    def numpy_audio_to_bytes(self, audio_data):
+        num_channels = 1
+        sampwidth = 2
 
+        buffer = io.BytesIO()
+        try:
+            with wave.open(buffer, "wb") as wf:
+                wf.setnchannels(num_channels)
+                wf.setsampwidth(sampwidth)
+                wf.setframerate(self.SAMPLE_RATE)
+                wf.writeframes(audio_data.tobytes())
+        except Exception as e:
+            raise AudioConversionError(f"Error converting numpy array to audio bytes: {e}")
+        return buffer.getvalue()
+
+    def process_audio_chunk(self, audio, audio_buffer, transcript):
+        """Process streamed audio data to accumulate and transcribe with overlapping segments."""
+        audio_buffer = np.concatenate((audio_buffer, audio[1]))
+
+        if len(audio_buffer) >= self.SAMPLE_RATE * self.CHUNK_LENGTH or len(audio_buffer) % (self.SAMPLE_RATE // 2) != 0:
+            audio_bytes = self.numpy_audio_to_bytes(audio_buffer[: self.SAMPLE_RATE * self.CHUNK_LENGTH])
+            audio_buffer = audio_buffer[self.SAMPLE_RATE * self.STEP_LENGTH :]
+
+            new_transcript = self.speech_to_text_stream(audio_bytes)
+            transcript = self.merge_transcript(transcript, new_transcript)
+
+        return transcript, audio_buffer, transcript["text"]
+
+    def speech_to_text_stream(self, audio):
+        if self.config.stt.type == "HF_API":
+            raise APIError("STT Error: Streaming not supported for this STT type")
+        try:
+            data = ("temp.wav", audio, "audio/wav")
+            client = OpenAI(base_url=self.config.stt.url, api_key=self.config.stt.key)
+            transcription = client.audio.transcriptions.create(
+                model=self.config.stt.name, file=data, response_format="verbose_json", timestamp_granularities=["word"]
+            )
+        except APIError as e:
+            raise
+        except Exception as e:
+            raise APIError(f"STT Error: Unexpected error: {e}")
+        return transcription.words
+
+    def merge_transcript(self, transcript, new_transcript):
+        cut_off = transcript["last_cutoff"]
+        transcript["last_cutoff"] = self.MAX_RELIABILITY_CUTOFF - self.STEP_LENGTH
+
+        transcript["words"] = transcript["words"][: len(transcript["words"]) - transcript["not_confirmed"]]
+
+        transcript["not_confirmed"] = 0
+        first_word = True
+
+        for word_dict in new_transcript:
+            if word_dict["start"] >= cut_off:
+                if first_word:
+                    if len(transcript["words"]) > 0 and transcript["words"][-1] == word_dict["word"]:
+                        continue
+                    first_word = False
+                transcript["words"].append(word_dict["word"])
+                if word_dict["start"] > self.MAX_RELIABILITY_CUTOFF:
+                    transcript["not_confirmed"] += 1
+                else:
+                    transcript["last_cutoff"] = max(1.0, word_dict["end"] - self.STEP_LENGTH)
+
+        transcript["text"] = " ".join(transcript["words"])
+
+        return transcript
+
+    def speech_to_text_full(self, audio):
+        audio = self.numpy_audio_to_bytes(audio[1])
         try:
             if self.config.stt.type == "OPENAI_API":
                 data = ("temp.wav", audio, "audio/wav")
@@ -58,14 +114,20 @@ class STTManager:
 
     def test_stt(self):
         try:
-            self.
+            self.speech_to_text_full((48000, np.zeros(10000)))
+            return True
+        except:
+            return False
+
+    def test_streaming(self):
+        try:
+            self.speech_to_text_stream(self.numpy_audio_to_bytes(np.zeros(10000)))
             return True
         except:
            return False
 
-    def add_user_message(self,
-        chat_display.append([transcription, None])
+    def add_user_message(self, message, chat_display):
+        chat_display.append([message, None])
         return chat_display
 
 
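Note: the core of this change is how overlapping chunks are merged. Audio is buffered into 5-second chunks that advance in 3-second steps, so consecutive requests overlap by 2 seconds; words that start in the last second of a chunk are kept but treated as unconfirmed and are replaced by the next chunk's transcription. The standalone sketch below illustrates that merge rule with invented word timestamps; the `merge` helper is a simplified variant of `STTManager.merge_transcript` (it omits the duplicate-boundary-word check) and is not code from this commit.

```python
# Simplified, standalone illustration of the merge rule used by STTManager.merge_transcript.
# Timestamps are invented; the duplicate-boundary-word check from the real method is omitted.

CHUNK_LENGTH = 5          # seconds of audio per transcription request
STEP_LENGTH = 3           # seconds the window advances between requests (2 s overlap)
MAX_RELIABILITY_CUTOFF = CHUNK_LENGTH - 1


def merge(transcript, new_words):
    cut_off = transcript["last_cutoff"]
    transcript["last_cutoff"] = MAX_RELIABILITY_CUTOFF - STEP_LENGTH

    # Drop the unconfirmed tail; the new, overlapping chunk re-transcribes that region.
    transcript["words"] = transcript["words"][: len(transcript["words"]) - transcript["not_confirmed"]]
    transcript["not_confirmed"] = 0

    for w in new_words:
        if w["start"] >= cut_off:
            transcript["words"].append(w["word"])
            if w["start"] > MAX_RELIABILITY_CUTOFF:
                transcript["not_confirmed"] += 1  # too close to the chunk's end to trust yet
            else:
                transcript["last_cutoff"] = max(1.0, w["end"] - STEP_LENGTH)

    transcript["text"] = " ".join(transcript["words"])
    return transcript


transcript = {"words": [], "not_confirmed": 0, "last_cutoff": 0, "text": ""}

# Chunk 1 covers 0-5 s; "your" starts after the 4 s cutoff, so it is provisional.
chunk_1 = [
    {"word": "tell", "start": 0.5, "end": 0.7},
    {"word": "me", "start": 0.8, "end": 0.9},
    {"word": "about", "start": 1.1, "end": 1.4},
    {"word": "your", "start": 4.2, "end": 4.4},
]
# Chunk 2 covers 3-8 s, so the same "your" now appears at 1.2 s and gets confirmed.
chunk_2 = [
    {"word": "your", "start": 1.2, "end": 1.4},
    {"word": "experience", "start": 1.5, "end": 2.1},
]

transcript = merge(transcript, chunk_1)
transcript = merge(transcript, chunk_2)
print(transcript["text"])  # tell me about your experience
```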
app.py CHANGED

@@ -1,6 +1,7 @@
 import os
 
 import gradio as gr
+import numpy as np
 
 from api.audio import STTManager, TTSManager
 from api.llm import LLMManager
@@ -22,6 +23,7 @@ default_audio_params = {
     "editable": False,
     "container": False,
     "show_share_button": False,
+    "streaming": stt.streaming,
 }
 
 
@@ -125,13 +127,25 @@ with gr.Blocks(title="AI Interviewer") as demo:
             code = gr.Code(
                 label="Please write your code here. You can use any language, but only Python syntax highlighting is available.",
                 language="python",
-                lines=
+                lines=46,
             )
         with gr.Column(scale=1):
             end_btn = gr.Button("Finish the interview", interactive=False)
             chat = gr.Chatbot(label="Chat", show_label=False, show_share_button=False)
+            message = gr.Textbox(
+                label="Message",
+                placeholder="Your message will appear here",
+                show_label=False,
+                lines=3,
+                max_lines=3,
+                interactive=False,
+            )
+            send_btn = gr.Button("Send", interactive=False)
             audio_input = gr.Audio(interactive=False, **default_audio_params)
 
+    audio_buffer = gr.State(np.array([], dtype=np.int16))
+    transcript = gr.State({"words": [], "not_confirmed": 0, "last_cutoff": 0, "text": ""})
+
     with gr.Accordion("Feedback", open=True) as feedback_acc:
         feedback = gr.Markdown()
 
@@ -165,14 +179,29 @@ with gr.Blocks(title="AI Interviewer") as demo:
         fn=llm.end_interview, inputs=[description, chat_history], outputs=[feedback]
     )
 
-        fn=lambda: None, outputs=[audio_input]
-    ).success(
+    send_btn.click(fn=stt.add_user_message, inputs=[message, chat], outputs=[chat]).success(fn=lambda: None, outputs=[message]).success(
         fn=llm.send_request,
         inputs=[code, previous_code, chat_history, chat],
         outputs=[chat_history, chat, previous_code],
+    ).success(fn=tts.read_last_message, inputs=[chat], outputs=[audio_output]).success(
+        fn=lambda: gr.Button("Send", interactive=False), outputs=[send_btn]
     ).success(
-        fn=
+        fn=lambda: np.array([], dtype=np.int16), outputs=[audio_buffer]
+    ).success(
+        fn=lambda: {"words": [], "not_confirmed": 0, "last_cutoff": 0, "text": ""}, outputs=[transcript]
     )
 
+    if stt.streaming:
+        audio_input.stream(
+            stt.process_audio_chunk,
+            inputs=[audio_input, audio_buffer, transcript],
+            outputs=[transcript, audio_buffer, message],
+            show_progress="hidden",
+        )
+        audio_input.stop_recording(fn=lambda: gr.Button("Send", interactive=True), outputs=[send_btn])
+    else:
+        audio_input.stop_recording(fn=stt.speech_to_text_full, inputs=[audio_input], outputs=[message]).success(
+            fn=lambda: gr.Button("Send", interactive=True), outputs=[send_btn]
+        ).success(fn=lambda: None, outputs=[audio_input])
+
 demo.launch(show_api=False)
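Note: the app.py wiring above follows the standard Gradio streaming pattern: `gr.Audio(streaming=True)` passes `(sample_rate, numpy array)` chunks to the `.stream()` handler, and `gr.State` carries the audio buffer and partial transcript between calls. The minimal, self-contained sketch below shows the same pattern with a dummy function in place of the real STT call; component names and the `process_chunk` helper are illustrative, and Gradio 4.x with mono microphone input is assumed. It is not code from this commit.

```python
# Minimal sketch of the streaming wiring pattern used in app.py, with a dummy transcriber.
# Assumes Gradio 4.x and mono microphone input.
import gradio as gr
import numpy as np


def process_chunk(chunk, buffer):
    sample_rate, data = chunk                # streamed chunks arrive as (rate, np.ndarray)
    buffer = np.concatenate((buffer, data))  # accumulate audio between callbacks
    text = f"buffered {len(buffer) / sample_rate:.1f} s of audio"  # the real app transcribes here
    return buffer, text


with gr.Blocks() as demo:
    audio_input = gr.Audio(sources=["microphone"], streaming=True)
    message = gr.Textbox(interactive=False)
    audio_buffer = gr.State(np.array([], dtype=np.int16))

    # Called repeatedly while recording; the State component keeps the buffer across calls.
    audio_input.stream(process_chunk, inputs=[audio_input, audio_buffer], outputs=[audio_buffer, message])
    # Reset the buffer when the user stops recording.
    audio_input.stop_recording(fn=lambda: np.array([], dtype=np.int16), outputs=[audio_buffer])

demo.launch()
```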