Spaces:

CineAI
/

Chelsea

Sleeping

File size: 1,335 Bytes

3edbfec
8d3b844
 
4abedda
11c4dd7
 
8d3b844
 
4abedda
 
8d3b844
 
 
c4f6c6c
8d3b844
c4f6c6c
8d3b844
 
 
 
c4f6c6c
8d3b844
 
 
 
 
 
 
4abedda
8d3b844
 
 
c4f6c6c
8d3b844
 
 
 
c4f6c6c
8d3b844

import numpy as np
import librosa
import io

from .init import pipe

# Task name that A2T.predict forwards to the pipeline as generate_kwargs["task"].
TASK = "transcribe"
# Value passed as the pipeline call's batch_size argument.
BATCH_SIZE = 8

class A2T:
    """Audio-to-text helper: decode a raw audio payload and transcribe it.

    The payload given at construction is decoded with ``librosa`` and the
    resulting waveform is passed to the module-level ``pipe`` callable
    (presumably a speech-recognition pipeline -- confirm in ``.init``).
    """

    def __init__(self, mic):
        """Store the audio payload to transcribe.

        Args:
            mic: Raw audio content that preprocessing wraps in ``io.BytesIO``
                (presumably the bytes of a recorded audio file -- confirm
                against the caller), or ``None`` when no audio was supplied.
        """
        self.mic = mic

    def __generate_text(self, inputs, task: "str | None" = None):
        """Run ``pipe`` on ``inputs`` and return the transcribed text.

        Args:
            inputs: Audio to transcribe; ``predict`` passes a NumPy array.
            task: Value forwarded to the pipeline as ``generate_kwargs["task"]``.

        Returns:
            The ``"text"`` entry of the pipeline result.

        Raises:
            ValueError: If ``inputs`` is ``None``.
        """
        if inputs is None:
            raise ValueError("Inputs is None")

        result = pipe(
            inputs,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": task},
            return_timestamps=True,
        )
        return result["text"]

    def __preprocces(self, raw: bytes):
        """Decode raw audio bytes into a waveform loaded at 16 kHz.

        Args:
            raw: Encoded audio content.

        Returns:
            The audio array returned by ``librosa.load``.
        """
        print(f"Raw type : {type(raw)}")
        # librosa reads from a file-like object, so wrap the bytes in memory.
        audio, sample_rate = librosa.load(io.BytesIO(raw), sr=16000)
        print(f"Sample rate : {sample_rate}")
        return audio

    def predict(self):
        """Transcribe the stored audio payload.

        Returns:
            The transcribed text on success. On any failure (missing audio,
            undecodable or empty audio, pipeline error) a string of the form
            ``"Oops some kinda error : <reason>"`` is returned instead of
            raising, so the caller always receives displayable text.
        """
        try:
            if self.mic is None:
                raise ValueError("please provide audio")

            audio = self.__preprocces(raw=self.mic)

            # Validate before touching array attributes, so a wrong type or an
            # empty recording yields a clear message rather than a failure
            # inside the diagnostic print below.
            if not isinstance(audio, np.ndarray):
                raise TypeError("Audio is not np array")
            if audio.size == 0:
                raise ValueError("Audio is empty")

            print(f"audio type : {type(audio)} \n shape : {audio.shape} \n audio max value : {np.max(audio)}")
            return self.__generate_text(inputs=audio, task=TASK)
        except Exception as e:
            # Deliberate best-effort boundary: report the failure as text.
            return f"Oops some kinda error : {e}"