Spaces:

Baghdad99
/

ha-en

Sleeping

Baghdad99 committed on Dec 21, 2023

Commit

fb1d852

•

1 Parent(s): 47453aa

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import gradio as gr
-from transformers import pipeline, AutoTokenizer
 import numpy as np
 from pydub import AudioSegment
 # Load the pipeline for speech recognition and translation
 pipe = pipeline(
@@ -13,22 +14,32 @@ translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-t
 tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")
 def translate_speech(audio_data_tuple):
-    print(f"Type of audio: {type(audio_data_tuple)}, Value of audio: {audio_data_tuple}")  # Debug line
     # Extract the audio data from the tuple
     sample_rate, audio_data = audio_data_tuple
-    # Print the shape and type of the audio data
-    print(f"Audio data type: {type(audio_data)}, Audio data shape: {audio_data.shape}")
-    # Normalize the audio data to the range [-1, 1]
-    audio_data_normalized = audio_data / np.iinfo(audio_data.dtype).max
-    # Convert the normalized audio data to float64
-    audio_data_float64 = audio_data_normalized.astype(np.float64)
     # Use the speech recognition pipeline to transcribe the audio
-    output = pipe(audio_data_float64)
     print(f"Output: {output}")  # Print the output to see what it contains

 import gradio as gr
 import numpy as np
 from pydub import AudioSegment
+import io
+from transformers import pipeline, AutoTokenizer
 # Load the pipeline for speech recognition and translation
 pipe = pipeline(
 tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")
 def translate_speech(audio_data_tuple):
     # Extract the audio data from the tuple
     sample_rate, audio_data = audio_data_tuple
+    # Convert the audio data to int16 format
+    audio_data_int16 = audio_data.astype(np.int16)
+    # Create an AudioSegment from the audio data
+    audio_segment = AudioSegment(
+        audio_data_int16.tobytes(),  # Audio data as bytes
+        frame_rate=sample_rate,
+        sample_width=audio_data_int16.dtype.itemsize,  # Width in bytes
+        channels=1
+    )
+    # Export the AudioSegment as MP3
+    mp3_buffer = io.BytesIO()
+    audio_segment.export(mp3_buffer, format="mp3")
+    # Now you have an MP3 file in a BytesIO buffer. You can write it to a file,
+    # send it over a network, etc. Here's how you can write it to a file:
+    with open("audio.mp3", "wb") as f:
+        f.write(mp3_buffer.getvalue())
+    # Now you can feed the MP3 file to your model
     # Use the speech recognition pipeline to transcribe the audio
+    output = pipe("audio.mp3")
     print(f"Output: {output}")  # Print the output to see what it contains