Spaces:

CineAI
/

Chelsea

Sleeping

CineAI committed on Mar 27, 2024

Commit

13a15d8

verified ·

1 Parent(s): 615c558

Update audio2text/a2t.py

Files changed (1) hide show

audio2text/a2t.py CHANGED Viewed

@@ -5,6 +5,7 @@ from .init import pipe
 TASK = "transcribe"
 BATCH_SIZE = 8
 LIMIT = 60
 class A2T:
     def __init__(self, mic):
@@ -18,23 +19,30 @@ class A2T:
         return transcribed_text
     def __preprocces(self, raw: np.ndarray, sampling_rate: int):
-        chunk = raw.astype(np.float32) / 32678.0
-        # if sampling_rate != 16000:
-        #     chunk = librosa.resample(chunk, orig_sr=sampling_rate, target_sr=16000)
-        # chunk = chunk[:16000*LIMIT]
         return chunk
     def predict(self):
         try:
             if self.mic is not None:
-                chunk = self.mic.get_array_of_samples()
-                chunk = np.array(chunk, dtype=np.int16)
                 sampling_rate = self.mic.frame_rate
                 audio = self.__preprocces(raw=chunk, sampling_rate=sampling_rate)
-                print(f"audio : {audio} \n shape : {audio.shape} \n max : {np.max(audio)}")
             else:
                 raise Exception("please provide audio")

 TASK = "transcribe"
 BATCH_SIZE = 8
 LIMIT = 60
+SAMPLING_RATE = 16000
 class A2T:
     def __init__(self, mic):
         return transcribed_text
     def __preprocces(self, raw: np.ndarray, sampling_rate: int):
+        chunk = raw.astype(np.float32,  order='C') / 32768.0
+        print(f"Chunk : {chunk}")
+        if len(chunk.shape) > 1:
+            chunk = librosa.to_mono(chunk.T)
+        if sampling_rate != SAMPLING_RATE:
+            chunk = librosa.resample(chunk, orig_sr=sampling_rate, target_sr=SAMPLING_RATE)
+        print(f"Sampling rate : {chunk}")
+        chunk = chunk[:SAMPLING_RATE*LIMIT]
         return chunk
     def predict(self):
         try:
             if self.mic is not None:
+                raw = self.mic.get_array_of_samples()
+                chunk = np.array(raw, dtype=np.int16)
                 sampling_rate = self.mic.frame_rate
                 audio = self.__preprocces(raw=chunk, sampling_rate=sampling_rate)
+                print(f"audio : {audio} \n shape : {audio.shape} \n max : {np.max(audio)} \n shape of chunk : {chunk.shape} \n sampling rate : {sampling_rate} \n raw audio : {raw} \n chunk : {chunk}")
             else:
                 raise Exception("please provide audio")