Baghdad99 committed on
Commit
1ee8cb6
1 Parent(s): fb1d852

Update app.py

Files changed (1)
  1. app.py +15 -21
app.py CHANGED
@@ -1,8 +1,7 @@
 import gradio as gr
+from transformers import pipeline, AutoTokenizer
 import numpy as np
 from pydub import AudioSegment
-import io
-from transformers import pipeline, AutoTokenizer

 # Load the pipeline for speech recognition and translation
 pipe = pipeline(
@@ -14,32 +13,22 @@ translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-t
 tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")

 def translate_speech(audio_data_tuple):
+    print(f"Type of audio: {type(audio_data_tuple)}, Value of audio: {audio_data_tuple}")  # Debug line
+
     # Extract the audio data from the tuple
     sample_rate, audio_data = audio_data_tuple

-    # Convert the audio data to int16 format
-    audio_data_int16 = audio_data.astype(np.int16)
-
-    # Create an AudioSegment from the audio data
-    audio_segment = AudioSegment(
-        audio_data_int16.tobytes(),  # Audio data as bytes
-        frame_rate=sample_rate,
-        sample_width=audio_data_int16.dtype.itemsize,  # Width in bytes
-        channels=1
-    )
+    # Print the shape and type of the audio data
+    print(f"Audio data type: {type(audio_data)}, Audio data shape: {audio_data.shape}")

-    # Export the AudioSegment as MP3
-    mp3_buffer = io.BytesIO()
-    audio_segment.export(mp3_buffer, format="mp3")
+    # Normalize the audio data to the range [-1, 1]
+    audio_data_normalized = audio_data / np.iinfo(audio_data.dtype).max

-    # Now you have an MP3 file in a BytesIO buffer. You can write it to a file,
-    # send it over a network, etc. Here's how you can write it to a file:
-    with open("audio.mp3", "wb") as f:
-        f.write(mp3_buffer.getvalue())
+    # Convert the normalized audio data to float64
+    audio_data_float64 = audio_data_normalized.astype(np.float64)

-    # Now you can feed the MP3 file to your model
     # Use the speech recognition pipeline to transcribe the audio
-    output = pipe("audio.mp3")
+    output = pipe(audio_data_float64)

     print(f"Output: {output}")  # Print the output to see what it contains

@@ -91,6 +80,11 @@ def translate_speech(audio_data_tuple):
     return 16000, synthesised_speech


+# Define the Gradio interface
+iface = gr.Interface(
+    fn=translate_speech,
+    inputs=gr.inputs.Audio(source="microphone"),  # Change this line
+
 # Define the Gradio interface
 iface = gr.Interface(
     fn=translate_speech,
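
For context, here is a minimal runnable sketch of the input path this commit switches to: Gradio's microphone component hands the callback a (sample_rate, numpy_array) tuple, the integer samples are scaled to [-1, 1], and the float array is fed straight to the transformers ASR pipeline, with no MP3 round-trip through pydub or a temporary file on disk. It is a simplified stand-in rather than the app itself: "openai/whisper-tiny" substitutes for the repo's own checkpoint (whose name is truncated out of the first hunk), the output is plain text instead of the app's translate-then-TTS chain, and it assumes Gradio 3.x, where gr.Audio replaces the legacy gr.inputs.Audio spelling used in the diff.

import gradio as gr
import numpy as np
from transformers import pipeline

# Stand-in checkpoint; the repo's own ASR model name is not visible in the diff above.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

def transcribe(audio_data_tuple):
    # Gradio's numpy-mode Audio component yields (sample_rate, samples).
    sample_rate, audio_data = audio_data_tuple
    # Microphone capture arrives as integer PCM; scale it to [-1, 1].
    if np.issubdtype(audio_data.dtype, np.integer):
        audio_data = audio_data / np.iinfo(audio_data.dtype).max
    # Downmix stereo recordings to mono, since the pipeline expects 1-D input.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    audio_float = audio_data.astype(np.float64)
    # The ASR pipeline accepts raw samples together with their sampling rate.
    output = asr({"raw": audio_float, "sampling_rate": sample_rate})
    return output["text"]

iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="numpy"),
    outputs="text",
)
iface.launch()

Feeding the array directly also avoids the lossy MP3 encode/decode and the per-request disk write of the old path. Note that the commit's unguarded division by np.iinfo(audio_data.dtype).max raises a ValueError when the input is already float, which is why the sketch checks the dtype first.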