File size: 2,086 Bytes
b8c0af1
 
 
 
27c3220
 
 
 
 
 
5479293
 
 
 
 
 
 
 
b8c0af1
 
 
 
 
 
 
 
 
 
 
 
 
 
27c3220
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5479293
27c3220
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os
import json
import tempfile

import gradio as gr

from google.cloud import speech
from microphone import MicrophoneStream
from utils import listen_print_loop

import pyaudio

def list_audio_devices():
    """Print the name of every audio device PyAudio can enumerate.

    Debugging aid for picking the microphone input. Writes one line per
    device to stdout and returns None.
    """
    audio = pyaudio.PyAudio()
    try:
        for i in range(audio.get_device_count()):
            device_info = audio.get_device_info_by_index(i)
            print(f"Device {i}: {device_info['name']}")
    finally:
        # PyAudio() acquires native PortAudio resources that are not
        # reliably released by garbage collection; terminate explicitly.
        audio.terminate()

# process of getting credentials
def get_credentials():
    """Materialize service-account JSON from the GOOGLE env var into a file.

    Reads the raw credentials JSON string from the GOOGLE environment
    variable and writes it to a temporary ``.json`` file whose path can be
    assigned to GOOGLE_APPLICATION_CREDENTIALS.

    Returns:
        str: path of the temporary credentials file.

    Raises:
        ValueError: if the GOOGLE environment variable is not set.
    """
    creds_json_str = os.getenv("GOOGLE")
    if creds_json_str is None:
        # Name the variable actually read ("GOOGLE"); the old message
        # referred to GOOGLE_APPLICATION_CREDENTIALS_JSON, which this
        # code never consults, and sent readers hunting the wrong var.
        raise ValueError("GOOGLE credentials JSON not found in environment")

    # delete=False is deliberate: the file must outlive this function so the
    # Google client can read it via GOOGLE_APPLICATION_CREDENTIALS later.
    with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".json") as temp:
        temp.write(creds_json_str)  # write the JSON string verbatim
        temp_filename = temp.name

    return temp_filename
    
# Must run before speech.SpeechClient() below: the Google client library
# reads GOOGLE_APPLICATION_CREDENTIALS when the client is constructed.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = get_credentials()

# Audio recording parameters
RATE = 16000  # sample rate in Hz; must match sample_rate_hertz below
CHUNK = int(RATE / 10)  # 100ms
LANGUAGE = "id-ID"  # BCP-47 code: Indonesian

transcribe_client = speech.SpeechClient()
# Recognition config: raw 16-bit little-endian PCM at RATE Hz.
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=RATE,
    language_code=LANGUAGE,
)

# interim_results=True makes the API stream partial hypotheses while the
# speaker is still talking, not just finalized utterances.
streaming_config = speech.StreamingRecognitionConfig(
    config=config, interim_results=True
)
    
async def main(audio):
    """Stream local microphone audio to Google Speech-to-Text.

    Args:
        audio: Gradio streaming-audio payload. NOTE(review): currently
            unused — audio is captured from the local microphone via
            MicrophoneStream rather than from the data Gradio delivers;
            confirm this is intentional.

    Returns:
        The value produced by ``listen_print_loop`` for the response
        stream — presumably the accumulated transcript. (The previous
        ``-> None`` annotation was wrong: this function returns a value.)
    """
    print("Streaming started ...")
    # list_audio_devices prints its output itself and returns None;
    # wrapping it in print() just emitted a stray "None" line.
    list_audio_devices()

    with MicrophoneStream(RATE, CHUNK) as stream:
        # Lazily wrap each raw audio chunk in a streaming request.
        requests = (
            speech.StreamingRecognizeRequest(audio_content=content)
            for content in stream.generator()
        )

        responses = transcribe_client.streaming_recognize(streaming_config, requests)

        return await listen_print_loop(responses)
    
# Gradio UI: live microphone input feeding the async transcription handler.
# NOTE(review): `main` returns a single value but two output components are
# declared (Textbox + Audio); Gradio expects one return value per output —
# confirm listen_print_loop yields a (text, audio) pair.
# NOTE(review): recent Gradio versions expect `sources` as a list, e.g.
# ["microphone"] — verify against the pinned gradio version.
demo = gr.Interface(
    fn=main, 
    inputs=[
        gr.Audio(sources="microphone", streaming=True, label="Input Speech")
    ],
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Audio(label="Audio")
    ],
    live=True)

if __name__ == "__main__":
    demo.launch()