File size: 2,037 Bytes
4ee5676
3edbfec
e18fb9d
a677076
54f1c88
b12e8e8
4ee5676
13a15d8
2ba318d
 
a677076
5b8ef5f
a677076
615c558
e18fb9d
 
 
615c558
4ee5676
847a572
4ee5676
13a15d8
4ee5676
cbd5d1a
4ee5676
13a15d8
 
 
 
 
 
cbd5d1a
13a15d8
 
cbd5d1a
 
4ee5676
847a572
a677076
 
 
95bbb32
13a15d8
 
4ee5676
 
cbd5d1a
95bbb32
ab121f4
6d73c34
 
f42d0da
6d73c34
ab121f4
95bbb32
a677076
ab121f4
2ba318d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import logging

import librosa
import numpy as np

from .init import pipe

# Decoding task forwarded to the pipeline through generate_kwargs["task"].
TASK = "transcribe"

# Batch size handed to every pipeline call.
BATCH_SIZE = 8

# Sample rate (Hz) that audio is resampled to before transcription.
SAMPLING_RATE = 16_000

# Maximum clip length in seconds; preprocessing keeps at most
# SAMPLING_RATE * LIMIT samples.
LIMIT = 60

class A2T:
    """Audio-to-text helper: converts a recorded clip into text via ``pipe``.

    ``mic`` must expose ``get_array_of_samples()`` and ``frame_rate``; an
    optional ``channels`` attribute is honoured when present.
    NOTE(review): this looks like a pydub ``AudioSegment`` — confirm with the
    caller.
    """

    _log = logging.getLogger(__name__)

    def __init__(self, mic):
        """Store the audio source; ``None`` means no audio was provided."""
        self.mic = mic

    def __transcribe(self, inputs, task: str = None, lang: str = "english"):
        """Run ``pipe`` on ``inputs`` and return the ``"text"`` field of its result.

        Args:
            inputs: Preprocessed audio to hand to ``pipe``.
            task: Value forwarded as ``generate_kwargs["task"]``.
            lang: Value forwarded as ``generate_kwargs["language"]``.

        Raises:
            ValueError: If ``inputs`` is ``None``.
        """
        if inputs is None:
            # Fail here instead of passing None on to the pipeline.
            raise ValueError("Inputs None")

        result = pipe(
            inputs,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": task, "language": lang},
            return_timestamps=True,
        )
        return result["text"]

    @staticmethod
    def _deinterleave(samples: np.ndarray, channels: int) -> np.ndarray:
        """Reshape a flat, channel-interleaved array to ``(frames, channels)``.

        Mono input (``channels <= 1``) is returned unchanged. A trailing
        partial frame is dropped so the reshape cannot fail.
        """
        if channels <= 1:
            return samples
        usable = samples.size - samples.size % channels
        return samples[:usable].reshape(-1, channels)

    def __preprocces(self, raw: np.ndarray, sampling_rate: int):
        """Scale, downmix, resample and truncate raw samples.

        Args:
            raw: Integer samples, either 1-D (mono) or ``(frames, channels)``.
            sampling_rate: Sample rate of ``raw`` in Hz.

        Returns:
            A float32 array at ``SAMPLING_RATE`` Hz holding at most
            ``SAMPLING_RATE * LIMIT`` samples.
        """
        # NOTE(review): 32768 assumes 16-bit samples (predict casts to int16);
        # confirm no other sample widths reach this path.
        chunk = raw.astype(np.float32, order='C') / 32768.0

        if chunk.ndim > 1:
            # librosa.to_mono expects (channels, frames), hence the transpose.
            chunk = librosa.to_mono(chunk.T)

        if sampling_rate != SAMPLING_RATE:
            chunk = librosa.resample(chunk, orig_sr=sampling_rate, target_sr=SAMPLING_RATE)

        chunk = chunk[:SAMPLING_RATE * LIMIT]
        self._log.debug(
            "preprocessed audio: %d samples (source rate %s Hz)",
            chunk.shape[0],
            sampling_rate,
        )
        return chunk

    def predict(self):
        """Transcribe the audio held in ``self.mic``.

        Returns:
            The transcribed text, or a string starting with
            ``"Oops some kinda error : "`` when anything goes wrong. Errors
            are reported in the return value rather than raised.
        """
        try:
            if self.mic is None:
                raise Exception("please provide audio")

            samples = np.array(self.mic.get_array_of_samples(), dtype=np.int16)
            if samples.size == 0:
                raise ValueError("audio is empty")

            # The sample array is flat; multi-channel audio is assumed to be
            # interleaved per frame (as pydub documents), so split it into
            # channels before the mono downmix. Sources without a `channels`
            # attribute are treated as mono.
            channels = int(getattr(self.mic, "channels", 1) or 1)
            samples = self._deinterleave(samples, channels)

            audio = self.__preprocces(raw=samples, sampling_rate=self.mic.frame_rate)
            if not isinstance(audio, np.ndarray):
                raise Exception("Audio is not np array")

            return self.__transcribe(inputs=audio, task=TASK)

        except Exception as e:
            # Boundary handler: callers expect a message string, not an exception.
            self._log.exception("transcription failed")
            return f"Oops some kinda error : {e}"