Spaces:

Baghdad99
/

ha-en

Sleeping

App Files Files Community

Baghdad99 committed on Dec 21, 2023

Commit

1ee8cb6

•

1 Parent(s): fb1d852

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -21

app.py CHANGED Viewed

@@ -1,8 +1,7 @@
 import gradio as gr
 import numpy as np
 from pydub import AudioSegment
-import io
-from transformers import pipeline, AutoTokenizer
 # Load the pipeline for speech recognition and translation
 pipe = pipeline(
@@ -14,32 +13,22 @@ translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-t
 tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")
 def translate_speech(audio_data_tuple):
     # Extract the audio data from the tuple
     sample_rate, audio_data = audio_data_tuple
-    # Convert the audio data to int16 format
-    audio_data_int16 = audio_data.astype(np.int16)
-    # Create an AudioSegment from the audio data
-    audio_segment = AudioSegment(
-        audio_data_int16.tobytes(),  # Audio data as bytes
-        frame_rate=sample_rate,
-        sample_width=audio_data_int16.dtype.itemsize,  # Width in bytes
-        channels=1
-    )
-    # Export the AudioSegment as MP3
-    mp3_buffer = io.BytesIO()
-    audio_segment.export(mp3_buffer, format="mp3")
-    # Now you have an MP3 file in a BytesIO buffer. You can write it to a file,
-    # send it over a network, etc. Here's how you can write it to a file:
-    with open("audio.mp3", "wb") as f:
-        f.write(mp3_buffer.getvalue())
-    # Now you can feed the MP3 file to your model
     # Use the speech recognition pipeline to transcribe the audio
-    output = pipe("audio.mp3")
     print(f"Output: {output}")  # Print the output to see what it contains
@@ -91,6 +80,11 @@ def translate_speech(audio_data_tuple):
     return 16000, synthesised_speech
 # Define the Gradio interface
 iface = gr.Interface(
     fn=translate_speech,

 import gradio as gr
+from transformers import pipeline, AutoTokenizer
 import numpy as np
 from pydub import AudioSegment
 # Load the pipeline for speech recognition and translation
 pipe = pipeline(
 tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")
 def translate_speech(audio_data_tuple):
+    print(f"Type of audio: {type(audio_data_tuple)}, Value of audio: {audio_data_tuple}")  # Debug line
     # Extract the audio data from the tuple
     sample_rate, audio_data = audio_data_tuple
+    # Print the shape and type of the audio data
+    print(f"Audio data type: {type(audio_data)}, Audio data shape: {audio_data.shape}")
+    # Normalize the audio data to the range [-1, 1]
+    audio_data_normalized = audio_data / np.iinfo(audio_data.dtype).max
+    # Convert the normalized audio data to float64
+    audio_data_float64 = audio_data_normalized.astype(np.float64)
     # Use the speech recognition pipeline to transcribe the audio
+    output = pipe(audio_data_float64)
     print(f"Output: {output}")  # Print the output to see what it contains
     return 16000, synthesised_speech
+# Define the Gradio interface
+iface = gr.Interface(
+    fn=translate_speech,
+    inputs=gr.inputs.Audio(source="microphone"),  # Change this line""
 # Define the Gradio interface
 iface = gr.Interface(
     fn=translate_speech,