CineAI committed on
Commit
8d3b844
1 Parent(s): 78714c6

Update audio2text/a2t.py

Browse files
Files changed (1) hide show
  1. audio2text/a2t.py +36 -64
audio2text/a2t.py CHANGED
@@ -1,69 +1,41 @@
1
  import numpy as np
2
- import speech_recognition as sr
3
-
4
- # TASK = "transcribe"
5
- # BATCH_SIZE = 8
6
- # LIMIT = 60
7
- # SAMPLING_RATE = 16000
8
-
9
- # class A2T:
10
- # def __init__(self, mic):
11
- # self.mic = mic
12
-
13
- # def __transcribe(self, inputs, task: str = None):
14
- # if inputs is None:
15
- # print("Inputs None")
16
-
17
- # transcribed_text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
18
- # return transcribed_text
19
-
20
- # def __preprocces(self, raw: np.ndarray, sampling_rate: int):
21
-
22
- # chunk = raw.astype(np.float32, order='C') / 32768.0
23
-
24
- # print(f"Chunk : {chunk} max chunk : {np.max(chunk)}")
25
-
26
- # if len(chunk.shape) > 1:
27
- # chunk = librosa.to_mono(chunk.T)
28
-
29
- # chunk = chunk[:SAMPLING_RATE*LIMIT]
30
-
31
- # return chunk
32
-
33
- # def predict(self):
34
- # try:
35
- # if self.mic is not None:
36
- # raw = self.mic.get_array_of_samples()
37
- # chunk = np.array(raw, dtype=np.int16)
38
- # sampling_rate = self.mic.frame_rate
39
- # audio = self.__preprocces(raw=chunk, sampling_rate=sampling_rate)
40
- # print(f"audio : {audio} \n shape : {audio.shape} \n max : {np.max(audio)} \n shape of chunk : {chunk.shape} \n sampling rate : {sampling_rate} \n max chunk : {np.max(chunk)} \n chunk : {chunk}")
41
- # else:
42
- # raise Exception("please provide audio")
43
-
44
- # if isinstance(audio , np.ndarray):
45
- # inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
46
- # return self.__transcribe(inputs=inputs, task=TASK)
47
- # else:
48
- # raise Exception("Audio is not np array")
49
-
50
- # except Exception as e:
51
- # return f"Oops some kinda error : {e}"
52
 
 
 
53
 
54
  class A2T:
55
- def get_text(self):
56
- # obtain audio from the microphone
57
- r = sr.Recognizer()
58
- with sr.Microphone() as source:
59
- print(source)
60
- audio = r.listen(source)
61
-
62
- # recognize speech using Sphinx
 
 
 
 
 
 
 
 
 
 
63
  try:
64
- return r.recognize_sphinx(audio)
65
- except sr.UnknownValueError:
66
- raise Exception("Sphinx could not understand audio")
67
- except sr.RequestError as e:
68
- raise Exception("Sphinx error; {0}".format(e))
69
-
 
 
 
 
 
 
 
 
 
1
  import numpy as np
2
+ import librosa
3
+ import io
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
+ TASK = "transcribe"
6
+ BATCH_SIZE = 8
7
 
8
class A2T:
    """Audio-to-text wrapper.

    Decodes raw audio bytes into a 16 kHz mono waveform and feeds it to the
    ASR pipeline bound to the global name ``pipe`` (defined elsewhere in the
    project, not in this file — TODO confirm it is in scope at call time).
    """

    def __init__(self, mic):
        # `mic` is treated as raw encoded audio bytes: __preprocces wraps it
        # in io.BytesIO and hands it to librosa.load. TODO confirm with caller.
        self.mic = mic

    def __transcribe(self, inputs, task: str = None):
        """Run the ASR pipeline on `inputs` and return the transcribed text.

        `inputs` is the dict built in predict():
        {"array": np.ndarray, "sampling_rate": int}. A None input is only
        logged here; `pipe` itself will then raise, and predict() catches it.
        """
        if inputs is None:
            print("Inputs None")

        transcribed_text = pipe(
            inputs,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": task},
            return_timestamps=True,
        )["text"]
        return transcribed_text

    def __preprocces(self, raw):
        """Decode `raw` audio bytes to a float32 mono waveform resampled to 16 kHz."""
        print(f"Raw type : {type(raw)}")
        chunk = io.BytesIO(raw)
        # sr=16000 fixes the sampling rate the pipeline input dict must declare.
        audio, sample_rate = librosa.load(chunk, sr=16000)
        print(f"Sample rate : {sample_rate}")
        return audio

    def predict(self):
        """Transcribe the audio supplied at construction time.

        Returns the transcribed text on success, or an error string
        ("Oops some kinda error : ...") on any failure.
        """
        try:
            if self.mic is not None:
                raw = self.mic
                audio = self.__preprocces(raw=raw)
                print(f"audio type : {type(audio)} \n shape : {audio.shape}")
            else:
                raise Exception("please provide audio")

            if isinstance(audio, np.ndarray):
                # BUG FIX: the original passed the undefined name `inputs`
                # to __transcribe (NameError, silently swallowed by the
                # broad except below). Build the pipeline input dict from
                # `audio`; 16000 matches the sr used in __preprocces.
                inputs = {"array": audio, "sampling_rate": 16000}
                return self.__transcribe(inputs=inputs, task=TASK)
            else:
                raise Exception("Audio is not np array")

        except Exception as e:
            return f"Oops some kinda error : {e}"