import librosa
import numpy as np

from .init import pipe  # shared ASR pipeline created in the package's init module

TASK = "transcribe"
BATCH_SIZE = 16
LIMIT = 60  # maximum audio length to keep, in seconds


class A2T:
    def __init__(self, mic):
        self.mic = mic

    def __transcribe(self, inputs, task: str = None):
        if inputs is None:
            raise ValueError("Inputs are None")
        transcribed_text = pipe(
            inputs,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": task, "language": "english"},
        )["text"]
        return transcribed_text

    def __preprocces(self, raw: np.ndarray, sampling_rate: int):
        # Convert 16-bit PCM samples to float32 in [-1.0, 1.0].
        chunk = raw.astype(np.float32) / 32768.0
        # The pipeline expects 16 kHz audio; resample if the source rate is higher.
        if sampling_rate > 16000:
            chunk = librosa.resample(chunk, orig_sr=sampling_rate, target_sr=16000)
        # Truncate to at most LIMIT seconds.
        chunk = chunk[:16000 * LIMIT]
        return chunk

    def predict(self):
        try:
            if self.mic is not None:
                chunk = self.mic.get_array_of_samples()
                chunk = np.array(chunk, dtype=np.int16)
                sampling_rate = self.mic.frame_rate
                audio = self.__preprocces(raw=chunk, sampling_rate=sampling_rate)
                print(f"audio : {audio} \n shape : {audio.shape} \n max : {np.max(audio)}")
            else:
                raise Exception("Please provide audio")

            if isinstance(audio, np.ndarray):
                # inputs = {"sampling_rate": 16000, "raw": audio}
                return self.__transcribe(inputs=audio, task=TASK)
            else:
                raise Exception("Audio is not a numpy array")
        except Exception as e:
            return f"Oops, some kind of error: {e}"
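
# --- Usage sketch (illustrative, not part of the original module) ---
# `mic` is assumed to be a pydub AudioSegment: it exposes get_array_of_samples()
# and frame_rate, which is exactly what predict() calls. The file name below is
# hypothetical, and because of the relative import above this should be run as a
# module, e.g. `python -m <package>.<this_module>`.
if __name__ == "__main__":
    from pydub import AudioSegment

    # Load any file ffmpeg can decode and down-mix to mono, since the
    # preprocessing assumes a single interleaved channel.
    segment = AudioSegment.from_file("recording.wav").set_channels(1)
    transcriber = A2T(mic=segment)
    print(transcriber.predict())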