File size: 2,412 Bytes
3edbfec
4abedda
a677076
4abedda
 
 
 
2ba318d
4abedda
 
 
a677076
4abedda
 
 
e18fb9d
4abedda
 
847a572
4abedda
6b2a0ad
4abedda
4ee5676
4abedda
4ee5676
4abedda
 
13a15d8
4abedda
4ee5676
4abedda
a677076
4abedda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95bbb32
4abedda
 
 
 
 
 
 
 
 
2e5c193
4abedda
 
 
 
 
 
 
 
 
2ba318d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import numpy as np
import speech_recognition as sr

# TASK = "transcribe"
# BATCH_SIZE = 8
# LIMIT = 60
# SAMPLING_RATE = 16000

# class A2T:
#     def __init__(self, mic):
#         self.mic = mic

#     def __transcribe(self, inputs, task: str = None):
#         if inputs is  None:
#             print("Inputs None")

#         transcribed_text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
#         return transcribed_text

#     def __preprocces(self, raw: np.ndarray, sampling_rate: int):
        
#         chunk = raw.astype(np.float32,  order='C') / 32768.0

#         print(f"Chunk : {chunk} max chunk : {np.max(chunk)}")

#         if len(chunk.shape) > 1:
#             chunk = librosa.to_mono(chunk.T)

#         chunk = chunk[:SAMPLING_RATE*LIMIT]
                
#         return chunk
        
#     def predict(self):
#         try:
#             if self.mic is not None:
#                 raw = self.mic.get_array_of_samples()
#                 chunk = np.array(raw, dtype=np.int16)
#                 sampling_rate = self.mic.frame_rate
#                 audio = self.__preprocces(raw=chunk, sampling_rate=sampling_rate)
#                 print(f"audio : {audio} \n shape : {audio.shape} \n max : {np.max(audio)} \n shape of chunk : {chunk.shape} \n sampling rate : {sampling_rate} \n max chunk : {np.max(chunk)} \n chunk : {chunk}")
#             else:
#                 raise Exception("please provide audio")

#             if isinstance(audio , np.ndarray):
#                 inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
#                 return self.__transcribe(inputs=inputs, task=TASK)
#             else:
#                 raise Exception("Audio is not np array")
                
#         except Exception as e:
#             return f"Oops some kinda error : {e}"


class A2T:
    """Audio-to-text helper: capture one phrase from the default microphone
    and transcribe it offline with the CMU Sphinx engine via the
    ``speech_recognition`` package (no network required).
    """

    def get_text(self) -> str:
        """Listen on the microphone once and return the recognized text.

        Returns:
            str: the transcription produced by the Sphinx recognizer.

        Raises:
            Exception: if the captured audio could not be understood, or the
                Sphinx backend failed. The underlying ``speech_recognition``
                error is chained as ``__cause__`` for debugging.
        """
        # obtain audio from the microphone
        r = sr.Recognizer()
        with sr.Microphone() as source:
            print(source)
            # listen() blocks until a phrase is detected and ends on silence
            audio = r.listen(source)

        # recognize speech using Sphinx (offline engine)
        try:
            return r.recognize_sphinx(audio)
        except sr.UnknownValueError as e:
            # Keep the caller-facing Exception type/message, but chain the
            # original error instead of discarding it.
            raise Exception("Sphinx could not understand audio") from e
        except sr.RequestError as e:
            raise Exception(f"Sphinx error; {e}") from e