|
import os |
|
import json |
|
import tempfile |
|
|
|
import gradio as gr |
|
|
|
from google.cloud import speech |
|
from microphone import MicrophoneStream |
|
from utils import listen_print_loop |
|
|
|
import pyaudio |
|
|
|
def list_audio_devices() -> None:
    """Print the index and name of every audio device PyAudio reports.

    One ``Device <index>: <name>`` line is written to stdout per device.
    Nothing is returned.
    """
    audio = pyaudio.PyAudio()
    try:
        for index in range(audio.get_device_count()):
            device_info = audio.get_device_info_by_index(index)
            print(f"Device {index}: {device_info['name']}")
    finally:
        # Release the PyAudio instance even if enumeration fails; without
        # this every call leaves an unterminated instance behind.
        audio.terminate()
|
|
|
|
|
def get_credentials():
    """Write the credentials JSON held in the environment to a temp file.

    The JSON text is read from the ``GOOGLE`` environment variable, checked
    to be parseable JSON, and written unchanged to a new ``.json`` temporary
    file.

    Returns:
        str: Path of the temporary file. The file is created with
        ``delete=False``, so it stays on disk until something removes it.

    Raises:
        ValueError: If ``GOOGLE`` is unset or empty, or if its value is not
            valid JSON.
    """
    creds_json_str = os.getenv("GOOGLE")
    # An empty value is as unusable as a missing one, so reject both here.
    if not creds_json_str:
        # The message names the variable that is actually read.
        raise ValueError("GOOGLE not found in environment")

    # Fail early with a clear error rather than writing a file that the
    # client library would reject later.
    try:
        json.loads(creds_json_str)
    except json.JSONDecodeError as err:
        raise ValueError("GOOGLE does not contain valid JSON") from err

    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".json", encoding="utf-8"
    ) as temp:
        temp.write(creds_json_str)

    return temp.name
|
|
|
# Point the Google client library at the credentials file written by
# get_credentials(); this must happen before the client below is created.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = get_credentials()

# Audio parameters shared by the microphone stream and the recognizer.
RATE = 16000
CHUNK = RATE // 10  # one tenth of RATE
LANGUAGE = "id-ID"

# Client and request configuration, built once at import time.
transcribe_client = speech.SpeechClient()

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=RATE,
    language_code=LANGUAGE,
)
streaming_config = speech.StreamingRecognitionConfig(
    config=config,
    interim_results=True,
)
|
|
|
async def main(audio):
    """Stream microphone audio through Google streaming speech recognition.

    Args:
        audio: Value supplied by the Gradio audio input. It is not read
            here; audio is captured through ``MicrophoneStream`` instead.

    Returns:
        Whatever ``listen_print_loop`` produces for the response stream.
    """
    print("Streaming started ...")
    # list_audio_devices() prints the device list itself and returns None,
    # so wrapping it in print() only emitted a stray "None" line.
    list_audio_devices()

    with MicrophoneStream(RATE, CHUNK) as stream:
        # Lazily wrap each captured chunk in a streaming request.
        requests = (
            speech.StreamingRecognizeRequest(audio_content=content)
            for content in stream.generator()
        )
        responses = transcribe_client.streaming_recognize(streaming_config, requests)

        # Consume the responses while the microphone stream is still open.
        return await listen_print_loop(responses)
|
|
|
# Gradio interface: a streaming microphone input wired to main(), with a
# text box and an audio component declared as outputs.
demo = gr.Interface(
    fn=main,
    inputs=[gr.Audio(sources="microphone", streaming=True, label="Input Speech")],
    outputs=[gr.Textbox(label="Transcription"), gr.Audio(label="Audio")],
    live=True,
)

if __name__ == "__main__":
    # Launch the interface only when this file is run as a script.
    demo.launch()