import json
import os
import tempfile

import gradio as gr
import pyaudio
from google.cloud import speech

from microphone import MicrophoneStream
from utils import listen_print_loop


def list_audio_devices():
    """Print the index and name of every audio device PyAudio reports.

    Output goes to stdout; the function returns ``None``.
    """
    audio = pyaudio.PyAudio()
    try:
        for i in range(audio.get_device_count()):
            device_info = audio.get_device_info_by_index(i)
            print(f"Device {i}: {device_info['name']}")
    finally:
        # Release the PortAudio resources held by this PyAudio instance so
        # that repeated calls do not leak handles.
        audio.terminate()


def get_credentials():
    """Write the credentials held in the ``GOOGLE`` env var to a temp file.

    Returns:
        Path of the ``.json`` temporary file that now holds the contents of
        the ``GOOGLE`` environment variable.

    Raises:
        ValueError: If the ``GOOGLE`` environment variable is not set.
    """
    creds_json_str = os.getenv("GOOGLE")
    if creds_json_str is None:
        # Name the variable that is actually read so the message is actionable.
        raise ValueError(
            "GOOGLE environment variable (service-account credentials JSON) "
            "not found in environment"
        )

    # delete=False: the file has to outlive this function because only its
    # path is handed on. Nothing in this file removes it afterwards.
    with tempfile.NamedTemporaryFile(
        mode="w+", delete=False, suffix=".json"
    ) as temp:
        temp.write(creds_json_str)
        temp_filename = temp.name

    return temp_filename


# Point GOOGLE_APPLICATION_CREDENTIALS at the generated file before the
# SpeechClient below is constructed.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = get_credentials()

# Audio recording parameters
RATE = 16000
CHUNK = int(RATE / 10)  # 100ms
LANGUAGE = "id-ID"

transcribe_client = speech.SpeechClient()
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=RATE,
    language_code=LANGUAGE,
)
streaming_config = speech.StreamingRecognitionConfig(
    config=config,
    interim_results=True,
)


async def main(audio) -> None:
    """Stream microphone audio to the speech client and return the result.

    Args:
        audio: Value supplied by the Gradio audio input. It is not used here;
            audio is read from ``MicrophoneStream`` instead.

    Returns:
        Whatever ``listen_print_loop`` returns for the response stream.
        NOTE(review): the ``-> None`` annotation does not match the ``return``
        below -- confirm the real return type of ``listen_print_loop``.
    """
    print("Streaming started ...")
    # list_audio_devices() prints the devices itself and returns None, so it
    # must not be wrapped in print() (that would emit a stray "None").
    list_audio_devices()

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        # Lazily wrap each item from the microphone generator in a request.
        requests = (
            speech.StreamingRecognizeRequest(audio_content=content)
            for content in audio_generator
        )
        responses = transcribe_client.streaming_recognize(
            streaming_config, requests
        )
        return await listen_print_loop(responses)


# NOTE(review): two output components are declared while main() returns a
# single value from listen_print_loop -- confirm that helper yields a
# (text, audio) pair.
demo = gr.Interface(
    fn=main,
    inputs=[
        gr.Audio(sources="microphone", streaming=True, label="Input Speech"),
    ],
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Audio(label="Audio"),
    ],
    live=True,
)

if __name__ == "__main__":
    demo.launch()