import librosa
import numpy as np

from .init import pipe  # shared ASR pipeline created in the package's init module

TASK = "transcribe"
BATCH_SIZE = 16
LIMIT = 60  # maximum audio length to keep, in seconds


class A2T:
    def __init__(self, mic):
        self.mic = mic

    def __transcribe(self, inputs, task: str = None):
        if inputs is None:
            raise ValueError("Inputs are None")
        transcribed_text = pipe(
            inputs,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": task, "language": "english"},
        )["text"]
        return transcribed_text

    def __preprocces(self, raw: np.ndarray, sampling_rate: int):
        # Convert 16-bit PCM samples to float32 in [-1.0, 1.0].
        chunk = raw.astype(np.float32) / 32768.0
        # The pipeline expects 16 kHz audio; resample if the source rate is higher.
        if sampling_rate > 16000:
            chunk = librosa.resample(chunk, orig_sr=sampling_rate, target_sr=16000)
        # Truncate to at most LIMIT seconds.
        chunk = chunk[:16000 * LIMIT]
        return chunk

    def predict(self):
        try:
            if self.mic is not None:
                chunk = self.mic.get_array_of_samples()
                chunk = np.array(chunk, dtype=np.int16)
                sampling_rate = self.mic.frame_rate
                audio = self.__preprocces(raw=chunk, sampling_rate=sampling_rate)
                print(f"audio : {audio} \n shape : {audio.shape} \n max : {np.max(audio)}")
            else:
                raise Exception("Please provide audio")

            if isinstance(audio, np.ndarray):
                # inputs = {"sampling_rate": 16000, "raw": audio}
                return self.__transcribe(inputs=audio, task=TASK)
            else:
                raise Exception("Audio is not a numpy array")
        except Exception as e:
            return f"Oops, some kind of error: {e}"
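
# --- Usage sketch (illustrative, not part of the original module) ---
# `mic` is assumed to be a pydub AudioSegment: it exposes get_array_of_samples()
# and frame_rate, which is exactly what predict() calls. The file name below is
# hypothetical, and because of the relative import above this should be run as a
# module, e.g. `python -m <package>.<this_module>`.
if __name__ == "__main__":
    from pydub import AudioSegment

    # Load any file ffmpeg can decode and down-mix to mono, since the
    # preprocessing assumes a single interleaved channel.
    segment = AudioSegment.from_file("recording.wav").set_channels(1)
    transcriber = A2T(mic=segment)
    print(transcriber.predict())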