# voice-ai / app.py
# Adipta's picture
# Update app.py
# 5479293 verified
import os
import json
import tempfile
import gradio as gr
from google.cloud import speech
from microphone import MicrophoneStream
from utils import listen_print_loop
import pyaudio
def list_audio_devices():
    """Print the index and name of every audio device PyAudio reports.

    Returns:
        None. One ``Device <index>: <name>`` line is written to stdout per
        device.
    """
    audio = pyaudio.PyAudio()
    try:
        for i in range(audio.get_device_count()):
            device_info = audio.get_device_info_by_index(i)
            print(f"Device {i}: {device_info['name']}")
    finally:
        # Always release the PyAudio instance, even if a device query fails,
        # so repeated calls do not accumulate open audio sessions.
        audio.terminate()
# process of getting credentials
def get_credentials():
    """Write the credentials JSON held in the ``GOOGLE`` env var to a temp file.

    Returns:
        str: Path of the newly created ``.json`` temporary file. The file is
        created with ``delete=False``, so it stays on disk after this
        function returns.

    Raises:
        ValueError: If ``GOOGLE`` is unset or empty, or if its value is not
            valid JSON.
    """
    creds_json_str = os.getenv("GOOGLE")
    # Treat an empty value like a missing one: an empty credentials file
    # would only fail later with a less helpful error.
    if not creds_json_str:
        raise ValueError('Environment variable "GOOGLE" not found in environment')

    # Fail early, and with a clear message, if the payload is not JSON.
    try:
        json.loads(creds_json_str)
    except json.JSONDecodeError as err:
        raise ValueError(
            'Environment variable "GOOGLE" does not contain valid JSON'
        ) from err

    # delete=False keeps the file alive after the handle is closed so that
    # its path can be handed to other code.
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".json", encoding="utf-8"
    ) as temp:
        temp.write(creds_json_str)
    return temp.name
# Publish the path of the credentials file through the standard
# GOOGLE_APPLICATION_CREDENTIALS variable before the client is created.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = get_credentials()

# Audio recording parameters
RATE = 16000
CHUNK = RATE // 10  # one tenth of RATE, i.e. 100ms worth of samples
LANGUAGE = "id-ID"

# Shared Speech client, created once at import time.
transcribe_client = speech.SpeechClient()

# Recognition settings: 16-bit linear PCM at RATE Hz in LANGUAGE.
config = speech.RecognitionConfig(
    language_code=LANGUAGE,
    sample_rate_hertz=RATE,
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
)

# Streaming wrapper around `config` that also asks for interim results.
streaming_config = speech.StreamingRecognitionConfig(
    interim_results=True,
    config=config,
)
async def main(audio) -> None:
    """Stream microphone audio to the Speech client and relay the responses.

    Args:
        audio: Value supplied by the Gradio input component. It is not read
            in this function; audio is taken from ``MicrophoneStream``
            instead.

    Returns:
        The awaited result of ``listen_print_loop(responses)``.
        NOTE(review): the signature is annotated ``-> None`` while the
        Gradio interface declares two outputs -- confirm what
        ``listen_print_loop`` actually returns.
    """
    print("Streaming started ...")
    # list_audio_devices() prints its own listing and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None" line.
    list_audio_devices()

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        # Lazily wrap each audio chunk in a streaming request; nothing is
        # consumed until the client starts iterating.
        requests = (
            speech.StreamingRecognizeRequest(audio_content=content)
            for content in audio_generator
        )
        responses = transcribe_client.streaming_recognize(streaming_config, requests)
        # Stay inside the `with` block so the stream remains open while the
        # responses are being consumed.
        return await listen_print_loop(responses)
# Gradio UI definition: `main` is the handler, fed by one streaming
# microphone input and wired to a text output and an audio output.
# NOTE(review): two outputs are declared here -- confirm that `main`
# returns a matching (text, audio) pair.
demo = gr.Interface(
    fn=main,
    inputs=[
        gr.Audio(sources="microphone", streaming=True, label="Input Speech")
    ],
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Audio(label="Audio")
    ],
    live=True)
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()