import io
from typing import Optional

import librosa
import numpy as np

from .config import pipe
# Value sent to the pipeline as generate_kwargs["task"] by A2T.predict.
TASK = "transcribe"
# Forwarded to the pipeline call as its batch_size argument.
BATCH_SIZE = 8
class A2T:
    """Audio-to-text helper: decode an audio payload and transcribe it.

    The payload given at construction time is decoded with ``librosa`` and
    handed to the module-level ``pipe`` callable imported from ``.config``
    (presumably a speech-recognition pipeline -- confirm in the config
    module).
    """

    def __init__(self, mic):
        """Store the audio payload.

        Args:
            mic: Encoded audio as ``bytes`` (it is wrapped in ``io.BytesIO``
                before decoding), or ``None`` when no audio was supplied.
        """
        self.mic = mic

    def __generate_text(self, inputs, task: Optional[str] = None):
        """Run ``pipe`` on *inputs* and return the ``"text"`` field.

        Args:
            inputs: Audio samples to pass to ``pipe``.
            task: Value forwarded as ``generate_kwargs["task"]``.

        Returns:
            The ``"text"`` entry of the result returned by ``pipe``.

        Raises:
            ValueError: If *inputs* is ``None``.
        """
        if inputs is None:
            raise ValueError("Inputs is None")
        result = pipe(
            inputs,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": task},
            return_timestamps=True,
        )
        return result["text"]

    def __preprocces(self, raw: bytes):
        """Decode encoded audio bytes into samples at 16 kHz.

        Args:
            raw: Encoded audio file contents.

        Returns:
            The sample array produced by ``librosa.load`` with ``sr=16000``.
        """
        print(f"Raw type : {type(raw)}")
        # librosa.load accepts file-like objects, so wrap the bytes in memory.
        chunk = io.BytesIO(raw)
        audio, sample_rate = librosa.load(chunk, sr=16000)
        print(f"Sample rate : {sample_rate}")
        return audio

    def predict(self):
        """Transcribe the stored audio.

        Returns:
            The transcribed text on success. On any failure the exception is
            not propagated; a string of the form
            ``"Oops some kinda error : <message>"`` is returned instead.
        """
        try:
            if self.mic is None:
                raise ValueError("please provide audio")

            audio = self.__preprocces(raw=self.mic)

            # Validate the type before touching ndarray attributes so the
            # dedicated error message is actually reachable.
            if not isinstance(audio, np.ndarray):
                raise TypeError("Audio is not np array")
            # np.max raises an obscure reduction error on zero-length input;
            # report the real problem instead.
            if audio.size == 0:
                raise ValueError("Audio is empty")

            print(f"audio type : {type(audio)} \n shape : {audio.shape} \n audio max value : {np.max(audio)}")
            return self.__generate_text(inputs=audio, task=TASK)
        except Exception as e:
            # Top-level boundary: callers receive an error string, not a raise.
            return f"Oops some kinda error : {e}"