File size: 2,412 Bytes
3edbfec 4abedda a677076 4abedda 2ba318d 4abedda a677076 4abedda e18fb9d 4abedda 847a572 4abedda 6b2a0ad 4abedda 4ee5676 4abedda 4ee5676 4abedda 13a15d8 4abedda 4ee5676 4abedda a677076 4abedda 95bbb32 4abedda 2e5c193 4abedda 2ba318d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import numpy as np
import speech_recognition as sr
# TASK = "transcribe"
# BATCH_SIZE = 8
# LIMIT = 60
# SAMPLING_RATE = 16000
# class A2T:
# def __init__(self, mic):
# self.mic = mic
# def __transcribe(self, inputs, task: str = None):
# if inputs is None:
# print("Inputs None")
# transcribed_text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
# return transcribed_text
# def __preprocces(self, raw: np.ndarray, sampling_rate: int):
# chunk = raw.astype(np.float32, order='C') / 32768.0
# print(f"Chunk : {chunk} max chunk : {np.max(chunk)}")
# if len(chunk.shape) > 1:
# chunk = librosa.to_mono(chunk.T)
# chunk = chunk[:SAMPLING_RATE*LIMIT]
# return chunk
# def predict(self):
# try:
# if self.mic is not None:
# raw = self.mic.get_array_of_samples()
# chunk = np.array(raw, dtype=np.int16)
# sampling_rate = self.mic.frame_rate
# audio = self.__preprocces(raw=chunk, sampling_rate=sampling_rate)
# print(f"audio : {audio} \n shape : {audio.shape} \n max : {np.max(audio)} \n shape of chunk : {chunk.shape} \n sampling rate : {sampling_rate} \n max chunk : {np.max(chunk)} \n chunk : {chunk}")
# else:
# raise Exception("please provide audio")
# if isinstance(audio , np.ndarray):
# inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
# return self.__transcribe(inputs=inputs, task=TASK)
# else:
# raise Exception("Audio is not np array")
# except Exception as e:
# return f"Oops some kinda error : {e}"
class A2T:
    """Audio-to-text helper: captures microphone audio and transcribes it with Sphinx."""

    def get_text(self):
        """Capture audio from the microphone and return the Sphinx transcription.

        A fresh ``sr.Recognizer`` and ``sr.Microphone`` (constructed with no
        arguments) are created on every call. ``listen`` is called without a
        timeout argument.

        Returns:
            Whatever ``Recognizer.recognize_sphinx`` returns for the captured
            audio (presumably the recognized text as ``str`` -- confirm against
            the speech_recognition documentation).

        Raises:
            Exception: "Sphinx could not understand audio" when the recognizer
                raises ``sr.UnknownValueError``; "Sphinx error; <details>" when
                it raises ``sr.RequestError``. The original library error is
                attached as ``__cause__``.
        """
        recognizer = sr.Recognizer()

        # Obtain audio from the microphone; the context manager releases the
        # device as soon as capture is finished.
        with sr.Microphone() as source:
            print(source)
            audio = recognizer.listen(source)

        # Recognize speech using Sphinx. Failures are re-raised as a plain
        # Exception (the type callers already handle), chained to the
        # underlying library error so the root cause stays in the traceback.
        try:
            return recognizer.recognize_sphinx(audio)
        except sr.UnknownValueError as err:
            raise Exception("Sphinx could not understand audio") from err
        except sr.RequestError as err:
            raise Exception(f"Sphinx error; {err}") from err
|