import librosa
import numpy as np
from .init import pipe

TASK = "transcribe"
BATCH_SIZE = 8
LIMIT = 60              # maximum audio duration to keep, in seconds
SAMPLING_RATE = 16000   # target sampling rate expected by the ASR pipeline

class A2T:
    def __init__(self, mic):
        self.mic = mic

    def __transcribe(self, inputs, task: str = None):
        if inputs is None:
            raise ValueError("Inputs must not be None")

        transcribed_text = pipe(
            inputs,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": task},
            return_timestamps=True,
        )["text"]
        return transcribed_text

    def __preprocess(self, raw: np.ndarray, sampling_rate: int):
        # Convert 16-bit PCM samples to float32 in [-1.0, 1.0]
        chunk = raw.astype(np.float32, order='C') / 32768.0

        print(f"Chunk : {chunk} max chunk : {np.max(chunk)}")

        # Down-mix multi-channel audio to mono
        if chunk.ndim > 1:
            chunk = librosa.to_mono(chunk.T)

        # Resample if the microphone rate differs from the pipeline's rate
        if sampling_rate != SAMPLING_RATE:
            chunk = librosa.resample(chunk, orig_sr=sampling_rate, target_sr=SAMPLING_RATE)

        # Keep at most LIMIT seconds of audio
        chunk = chunk[:SAMPLING_RATE * LIMIT]

        return chunk

    def predict(self):
        try:
            if self.mic is not None:
                # Raw 16-bit samples and sampling rate from the recording
                raw = self.mic.get_array_of_samples()
                chunk = np.array(raw, dtype=np.int16)
                sampling_rate = self.mic.frame_rate
                audio = self.__preprocess(raw=chunk, sampling_rate=sampling_rate)
                print(f"audio : {audio} \n shape : {audio.shape} \n max : {np.max(audio)} \n shape of chunk : {chunk.shape} \n sampling rate : {sampling_rate} \n max chunk : {np.max(chunk)} \n chunk : {chunk}")
            else:
                raise ValueError("Please provide audio")

            if isinstance(audio, np.ndarray):
                inputs = {"array": audio, "sampling_rate": pipe.feature_extractor.sampling_rate}
                return self.__transcribe(inputs=inputs, task=TASK)
            else:
                raise TypeError("Audio is not a NumPy array")

        except Exception as e:
            return f"Transcription failed: {e}"