import io

import librosa
import numpy as np

TASK = "transcribe"
BATCH_SIZE = 8
# Sample rate (Hz) that audio is resampled to before being handed to the model.
SAMPLE_RATE = 16000


class A2T:
    """Audio-to-text helper: decode raw audio bytes and transcribe them.

    The instance stores the raw audio payload in ``mic``; ``predict`` decodes
    it with librosa and forwards the waveform to the module-level ``pipe``
    callable.
    """

    def __init__(self, mic):
        """Store the raw audio payload.

        Args:
            mic: Encoded audio as a bytes-like object (it is wrapped in
                ``io.BytesIO`` during preprocessing), or ``None`` when no
                audio was supplied.
        """
        self.mic = mic

    def __transcribe(self, inputs, task: str = None):
        """Run the speech-recognition pipeline and return its ``"text"`` field.

        Args:
            inputs: Audio waveform to transcribe.
            task: Value forwarded as ``generate_kwargs={"task": task}``.

        Returns:
            The ``"text"`` entry of the pipeline result.

        Raises:
            ValueError: If ``inputs`` is ``None``.
        """
        if inputs is None:
            print("Inputs None")
            # Fail fast instead of handing ``None`` to the pipeline.
            raise ValueError("Inputs None")

        # NOTE(review): `pipe` is not defined or imported in this file; it is
        # presumably a Hugging Face ASR pipeline created elsewhere -- confirm
        # that it is in module scope before this method runs.
        result = pipe(
            inputs,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": task},
            return_timestamps=True,
        )
        return result["text"]

    def __preprocces(self, raw):
        """Decode encoded audio bytes into a waveform at ``SAMPLE_RATE`` Hz.

        Args:
            raw: Bytes-like object holding an encoded audio file.

        Returns:
            The audio array returned by ``librosa.load``.
        """
        print(f"Raw type : {type(raw)}")
        # librosa reads from a file-like object, so wrap the bytes in memory.
        chunk = io.BytesIO(raw)
        audio, sample_rate = librosa.load(chunk, sr=SAMPLE_RATE)
        print(f"Sample rate : {sample_rate}")
        return audio

    def predict(self):
        """Transcribe the stored audio.

        Returns:
            The transcribed text on success. On any failure, a string of the
            form ``"Oops some kinda error : <message>"`` is returned instead
            of raising, so callers always receive a ``str``.
        """
        try:
            if self.mic is None:
                raise ValueError("please provide audio")

            audio = self.__preprocces(raw=self.mic)
            print(f"audio type : {type(audio)} \n shape : {audio.shape}")

            if not isinstance(audio, np.ndarray):
                raise TypeError("Audio is not np array")

            # Pass the decoded waveform; the original referenced an undefined
            # name `inputs` here, which always raised NameError.
            return self.__transcribe(inputs=audio, task=TASK)
        except Exception as e:
            # Top-level boundary: report the failure as text to the caller.
            return f"Oops some kinda error : {e}"