IliaLarchenko committed · Commit 6bb887d · 1 Parent(s): da72dc0

Improved STT logic

Files changed:
- api/audio.py +46 -13
- app.py +1 -0
- requirements.txt +1 -0
- ui/coding.py +34 -31
- utils/ui.py +2 -1
api/audio.py CHANGED

@@ -8,6 +8,28 @@ from openai import OpenAI
 
 from utils.errors import APIError, AudioConversionError
 from typing import List, Dict, Optional, Generator, Tuple
+import webrtcvad
+
+
+def detect_voice(audio: np.ndarray, sample_rate: int = 48000, frame_duration: int = 30) -> bool:
+    vad = webrtcvad.Vad()
+    vad.set_mode(3)  # Aggressiveness mode: 0 (least aggressive) to 3 (most aggressive)
+
+    # Convert numpy array to 16-bit PCM bytes
+    audio_bytes = audio.tobytes()
+
+    num_samples_per_frame = int(sample_rate * frame_duration / 1000)
+    frames = [audio_bytes[i : i + num_samples_per_frame * 2] for i in range(0, len(audio_bytes), num_samples_per_frame * 2)]
+
+    count_speech = 0
+    for frame in frames:
+        if len(frame) < num_samples_per_frame * 2:
+            continue
+        if vad.is_speech(frame, sample_rate):
+            count_speech += 1
+            if count_speech > 6:
+                return True
+    return False
 
 
 class STTManager:
@@ -42,9 +64,7 @@ class STTManager:
             raise AudioConversionError(f"Error converting numpy array to audio bytes: {e}")
         return buffer.getvalue()
 
-    def process_audio_chunk(
-        self, audio: Tuple[int, np.ndarray], audio_buffer: np.ndarray, transcript: Dict
-    ) -> Tuple[Dict, np.ndarray, str]:
+    def process_audio_chunk(self, audio: Tuple[int, np.ndarray], audio_buffer: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
         """
         Process streamed audio data to accumulate and transcribe with overlapping segments.
 
@@ -53,15 +73,26 @@ class STTManager:
         :param transcript: Current transcript dictionary.
         :return: Updated transcript, updated audio buffer, and transcript text.
         """
-        audio_buffer = np.concatenate((audio_buffer, audio[1]))
 
+        has_voice = detect_voice(audio[1])
+        ended = len(audio[1]) % 24000 != 0
+
+        if has_voice:
+            audio_buffer = np.concatenate((audio_buffer, audio[1]))
+
+        is_short = len(audio_buffer) / 48000 < 1.0
+
+        if is_short or (has_voice and not ended):
+            return audio_buffer, np.array([], dtype=np.int16)
+
+        return np.array([], dtype=np.int16), audio_buffer
 
+    def transcribe_audio(self, audio: np.ndarray, text) -> str:
+        if len(audio) < 500:
+            return text
+        else:
+            transcript = self.transcribe_numpy_array(audio, context=text)
+            return text + " " + transcript
 
     def speech_to_text_stream(self, audio: bytes) -> List[Dict[str, str]]:
         """
@@ -114,19 +145,21 @@ class STTManager:
         transcript["text"] = " ".join(transcript["words"])
         return transcript
 
-    def speech_to_text_full(self, audio) -> str:
+    def transcribe_numpy_array(self, audio: np.ndarray, context: Optional[str] = None) -> str:
         """
         Convert speech to text from a full audio segment.
 
         :param audio: Tuple containing the sample rate and audio data as numpy array.
        :return: Transcribed text.
         """
-        audio_bytes = self.numpy_audio_to_bytes(audio[1])
+        audio_bytes = self.numpy_audio_to_bytes(audio)
         try:
             if self.config.stt.type == "OPENAI_API":
                 data = ("temp.wav", audio_bytes, "audio/wav")
                 client = OpenAI(base_url=self.config.stt.url, api_key=self.config.stt.key)
-                transcription = client.audio.transcriptions.create(
+                transcription = client.audio.transcriptions.create(
+                    model=self.config.stt.name, file=data, response_format="text", prompt=context
+                )
             elif self.config.stt.type == "HF_API":
                 headers = {"Authorization": "Bearer " + self.config.stt.key}
                 response = requests.post(self.config.stt.url, headers=headers, data=audio_bytes)
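For context, here is a minimal sketch of the frame-level check that the new detect_voice helper builds on. It assumes only numpy and the webrtcvad package added in requirements.txt; the synthetic silence frame is made up for illustration, not taken from the repository.

    import numpy as np
    import webrtcvad

    sample_rate = 48000                        # webrtcvad accepts 8/16/32/48 kHz, 16-bit mono PCM
    frame_ms = 30                              # frames must be 10, 20 or 30 ms long
    samples_per_frame = sample_rate * frame_ms // 1000

    vad = webrtcvad.Vad()
    vad.set_mode(3)                            # most aggressive filtering, same mode the commit uses

    silence = np.zeros(samples_per_frame, dtype=np.int16)   # one 30 ms frame of silence
    print(vad.is_speech(silence.tobytes(), sample_rate))    # expected: False

detect_voice runs this per-frame check over the whole incoming chunk and reports speech once more than 6 frames (roughly 0.2 s at 30 ms per frame) test positive.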
app.py CHANGED

@@ -35,6 +35,7 @@ def main():
     """Main function to initialize services and launch the Gradio interface."""
     config, llm, tts, stt = initialize_services()
     demo = create_interface(llm, tts, stt, default_audio_params)
+    demo.config["dependencies"][0]["show_progress"] = "hidden"
     demo.launch(show_api=False)
 
 
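The one-line config edit above turns off the progress overlay for the first registered event dependency after the Blocks app is built. For reference, Gradio 4.x also accepts the same option directly on an event listener, which is how the new streaming events in ui/coding.py are handled; a minimal sketch with made-up components:

    import gradio as gr

    with gr.Blocks() as demo:
        text_in = gr.Textbox(label="input")
        text_out = gr.Textbox(label="output")
        # show_progress="hidden" suppresses the loading overlay for this event only
        text_in.change(fn=lambda s: s.upper(), inputs=[text_in], outputs=[text_out], show_progress="hidden")

    demo.launch()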
requirements.txt CHANGED

@@ -2,3 +2,4 @@ gradio==4.29.0
 openai==1.19.0
 python-dotenv==1.0.1
 pytest==8.2.0
+webrtcvad==2.0.10
ui/coding.py CHANGED

@@ -3,11 +3,14 @@ import numpy as np
 import os
 
 from itertools import chain
+import time
 
 from resources.data import fixed_messages, topic_lists
 from utils.ui import add_candidate_message, add_interviewer_message
 from typing import List, Dict, Generator, Optional, Tuple
 from functools import partial
+from api.llm import LLMManager
+from api.audio import TTSManager, STTManager
 
 
 def send_request(
@@ -15,8 +18,8 @@ def send_request(
     previous_code: str,
     chat_history: List[Dict[str, str]],
     chat_display: List[List[Optional[str]]],
-    llm,
-    tts,
+    llm: LLMManager,
+    tts: Optional[TTSManager],
     silent: Optional[bool] = False,
 ) -> Generator[Tuple[List[Dict[str, str]], List[List[Optional[str]]], str, bytes], None, None]:
     """
@@ -26,14 +29,19 @@ def send_request(
     if silent is None:
         silent = os.getenv("SILENT", False)
 
+    if chat_display[-1][0] is None and code == previous_code:
+        yield chat_history, chat_display, code, b""
+        return
+
     chat_history = llm.update_chat_history(code, previous_code, chat_history, chat_display)
     original_len = len(chat_display)
     chat_display.append([None, ""])
-    chat_history.append({"role": "assistant", "content": ""})
 
     text_chunks = []
     reply = llm.get_text(chat_history)
 
+    chat_history.append({"role": "assistant", "content": ""})
+
     audio_generator = iter(())
     has_text_item = True
     has_audio_item = not silent
@@ -99,7 +107,7 @@ def change_code_area(interview_type):
     )
 
 
-def get_problem_solving_ui(llm, tts, stt, default_audio_params, audio_output):
+def get_problem_solving_ui(llm: LLMManager, tts: TTSManager, stt: STTManager, default_audio_params: Dict, audio_output):
     send_request_partial = partial(send_request, llm=llm, tts=tts)
 
     with gr.Tab("Interview", render=False, elem_id=f"tab") as problem_tab:
@@ -178,20 +186,22 @@ def get_problem_solving_ui(llm, tts, stt, default_audio_params, audio_output):
         with gr.Column(scale=1):
             end_btn = gr.Button("Finish the interview", interactive=False, variant="stop", elem_id=f"end_btn")
             chat = gr.Chatbot(label="Chat", show_label=False, show_share_button=False, elem_id=f"chat")
+
+            # I need this message box only because chat component is flickering when I am updating it
+            # To be improved in the future
             message = gr.Textbox(
                 label="Message",
                 show_label=False,
-                lines=
-                max_lines=
-                interactive=
+                lines=5,
+                max_lines=5,
+                interactive=False,
                 container=False,
                 elem_id=f"message",
             )
-            send_btn = gr.Button("Send", interactive=False, elem_id=f"send_btn")
-            audio_input = gr.Audio(interactive=False, **default_audio_params, elem_id=f"audio_input")
 
+            audio_input = gr.Audio(interactive=False, **default_audio_params, elem_id=f"audio_input")
         audio_buffer = gr.State(np.array([], dtype=np.int16))
+        audio_to_transcribe = gr.State(np.array([], dtype=np.int16))
 
     with gr.Accordion("Feedback", open=True, visible=False) as feedback_acc:
         feedback = gr.Markdown(elem_id=f"feedback", line_breaks=True)
@@ -219,8 +229,8 @@ def get_problem_solving_ui(llm, tts, stt, default_audio_params, audio_output):
     ).success(
         fn=llm.init_bot, inputs=[description, interview_type_select], outputs=[chat_history]
     ).success(
-        fn=lambda: (gr.update(visible=True), gr.update(interactive=True), gr.update(interactive=True)
-        outputs=[solution_acc, end_btn, audio_input
+        fn=lambda: (gr.update(visible=True), gr.update(interactive=True), gr.update(interactive=True)),
+        outputs=[solution_acc, end_btn, audio_input],
     )
 
     end_btn.click(fn=lambda x: add_candidate_message("Let's stop here.", x), inputs=[chat], outputs=[chat]).success(
@@ -233,9 +243,8 @@ def get_problem_solving_ui(llm, tts, stt, default_audio_params, audio_output):
             gr.update(interactive=False),
             gr.update(open=False),
             gr.update(interactive=False),
-            gr.update(interactive=False),
         ),
-        outputs=[solution_acc, end_btn, problem_acc, audio_input
+        outputs=[solution_acc, end_btn, problem_acc, audio_input],
     ).success(
         fn=lambda: (gr.update(visible=True)),
         outputs=[feedback_acc],
@@ -243,32 +252,26 @@ def get_problem_solving_ui(llm, tts, stt, default_audio_params, audio_output):
         fn=llm.end_interview, inputs=[description, chat_history, interview_type_select], outputs=[feedback]
     )
 
+    audio_input.stream(
+        stt.process_audio_chunk,
+        inputs=[audio_input, audio_buffer],
+        outputs=[audio_buffer, audio_to_transcribe],
+        show_progress="hidden",
+    ).success(fn=stt.transcribe_audio, inputs=[audio_to_transcribe, message], outputs=[message], show_progress="hidden")
+
+    # TODO: find a way to remove delay
+    audio_input.stop_recording(fn=lambda: time.sleep(2)).success(
+        fn=add_candidate_message, inputs=[message, chat], outputs=[chat]
     ).success(
         fn=send_request_partial,
         inputs=[code, previous_code, chat_history, chat],
         outputs=[chat_history, chat, previous_code, audio_output],
-        # ).success(
-        #     fn=tts.read_last_message, inputs=[chat], outputs=[audio_output]
     ).success(
         fn=lambda: np.array([], dtype=np.int16), outputs=[audio_buffer]
     ).success(
+        lambda: "", outputs=[message]
     )
 
-    if stt.streaming:
-        audio_input.stream(
-            stt.process_audio_chunk,
-            inputs=[audio_input, audio_buffer, transcript],
-            outputs=[transcript, audio_buffer, message],
-            show_progress="hidden",
-        )
-    else:
-        audio_input.stop_recording(fn=stt.speech_to_text_full, inputs=[audio_input], outputs=[message]).success(
-            fn=lambda: gr.update(interactive=True), outputs=[send_btn]
-        ).success(fn=lambda: None, outputs=[audio_input])
-
     interview_type_select.change(
         fn=lambda x: gr.update(choices=topic_lists[x], value=np.random.choice(topic_lists[x])),
         inputs=[interview_type_select],
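To make the hand-off between the two State buffers wired up above easier to follow (stream → process_audio_chunk → transcribe_audio), here is a small, self-contained simulation of that contract. The STT calls are replaced with stubs, and the chunk sizes and voice flags are invented for illustration; only the buffering rule mirrors the commit.

    import numpy as np

    def process_audio_chunk(chunk, audio_buffer, has_voice, ended):
        # Mirrors STTManager.process_audio_chunk: keep accumulating while speech continues,
        # otherwise flush the accumulated buffer into the "to transcribe" state.
        if has_voice:
            audio_buffer = np.concatenate((audio_buffer, chunk))
        is_short = len(audio_buffer) / 48000 < 1.0
        if is_short or (has_voice and not ended):
            return audio_buffer, np.array([], dtype=np.int16)   # keep buffering
        return np.array([], dtype=np.int16), audio_buffer       # hand off for transcription

    def transcribe_audio(audio, text):
        # Stub for STTManager.transcribe_audio: only "transcribes" once enough audio was flushed.
        if len(audio) < 500:
            return text
        return (text + f" [transcript of {len(audio)} samples]").strip()

    message = ""
    buffer = np.array([], dtype=np.int16)
    # three voiced half-second chunks at 48 kHz, then a short silent chunk that ends the utterance
    chunks = [
        (np.ones(24000, dtype=np.int16), True),
        (np.ones(24000, dtype=np.int16), True),
        (np.ones(24000, dtype=np.int16), True),
        (np.zeros(1000, dtype=np.int16), False),
    ]
    for chunk, has_voice in chunks:
        ended = len(chunk) % 24000 != 0
        buffer, to_transcribe = process_audio_chunk(chunk, buffer, has_voice, ended)
        message = transcribe_audio(to_transcribe, message)
    print(message)  # -> [transcript of 72000 samples]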
utils/ui.py CHANGED

@@ -8,7 +8,8 @@ def add_interviewer_message(message):
 
 
 def add_candidate_message(message, chat):
-    chat.append((message, None))
+    if message and len(message) > 0:
+        chat.append((message, None))
     return chat
 
 