File size: 2,297 Bytes
5b74a4b
fcc244c
83e3ccb
22cb79a
9829b9c
5b74a4b
fcc244c
cca146f
fcc244c
 
 
425531b
fcc244c
cca146f
 
 
 
 
fcc244c
5b74a4b
fcc244c
4e03a52
 
9829b9c
cca146f
 
 
 
 
5b74a4b
72632b9
fcc244c
c58bd88
fcc244c
 
 
17cfe18
22cb79a
9829b9c
22cb79a
 
 
 
 
83e3ccb
 
a5ec736
b2c7d3a
5b74a4b
 
4e03a52
8fe6fd5
5b74a4b
 
 
 
b2c7d3a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import io
import os

import gradio as gr
import numpy as np
import requests
from pydub import AudioSegment

# Hugging Face Inference API endpoints, one per pipeline stage:
# speech recognition (ASR), text translation, and text-to-speech (TTS).
ASR_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/saad-speech-recognition-hausa-audio-to-text"
TTS_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/english_voice_tts"
TRANSLATION_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/saad-hausa-text-to-english-text"

# SECURITY: an access token was committed to this file in plain text, so it
# should be treated as compromised (revoke and rotate it). Supply the token
# through the HF_API_TOKEN environment variable instead; the literal below is
# kept only as a backward-compatible fallback and should be removed once the
# environment variable is configured wherever this script runs.
_HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "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")

# Authorization header sent with every Inference API request.
headers = {"Authorization": f"Bearer {_HF_API_TOKEN}"}

# Helper for JSON-returning calls to the Hugging Face Inference API
def query(api_url, payload=None, data=None, timeout=60):
    """POST a request to an Inference API endpoint and return the decoded JSON.

    Args:
        api_url: Full URL of the endpoint to call.
        payload: JSON-serializable object sent as the request body when
            ``data`` is None.
        data: Raw request body (e.g. the bytes of an audio file). When given,
            it is sent as-is and ``payload`` is ignored.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    if data is not None:
        response = requests.post(api_url, headers=headers, data=data, timeout=timeout)
    else:
        response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error payload back to
    # the caller as if it were a successful result.
    response.raise_for_status()
    return response.json()

# Speech-to-speech pipeline: transcribe -> translate -> synthesize
def _extract_translation(result):
    """Return the translated string contained in a translation API response.

    Accepts a plain string, a dict, or a list whose first element is such a
    dict; the dict is searched for a ``translation_text`` or
    ``generated_text`` key.
    NOTE(review): the exact response schema of the deployed model is not
    visible from this file -- confirm which key it actually returns.

    Raises:
        RuntimeError: If no translated text can be found in ``result``.
    """
    if isinstance(result, str):
        return result
    if isinstance(result, list) and result:
        result = result[0]
    if isinstance(result, dict):
        for key in ("translation_text", "generated_text"):
            if key in result:
                return result[key]
    raise RuntimeError(f"Unexpected translation response: {result!r}")


def translate_speech(audio_file):
    """Transcribe an uploaded audio file, translate the text, and speak it.

    Args:
        audio_file: Uploaded file object exposing a ``name`` attribute that
            holds the path of the file on disk.

    Returns:
        A ``(sample_rate, samples)`` tuple: the frame rate of the synthesized
        audio and its samples as a numpy array (reshaped to
        ``(frames, channels)`` when the audio has more than one channel).

    Raises:
        ValueError: If no file was provided.
        RuntimeError: If the ASR or translation response has no usable text.
        requests.HTTPError: If the TTS endpoint answers with a 4xx/5xx status.
    """
    print(f"Type of audio: {type(audio_file)}, Value of audio: {audio_file}")  # Debug line

    if audio_file is None:
        raise ValueError("No audio file was provided.")

    # Step 1: send the raw audio bytes to the ASR endpoint.
    with open(audio_file.name, "rb") as f:
        data = f.read()
    output = query(ASR_API_URL, data=data)
    if not isinstance(output, dict) or "text" not in output:
        # Surface the API's own payload rather than an opaque KeyError.
        raise RuntimeError(f"Speech recognition failed: {output!r}")
    transcription = output["text"]

    # Step 2: translate, and unwrap the JSON so that the TTS stage receives a
    # plain string instead of the raw response object.
    translated_text = _extract_translation(
        query(TRANSLATION_API_URL, {"inputs": transcription})
    )

    # Step 3: synthesize speech. This endpoint returns audio bytes rather than
    # JSON, so it is called directly instead of through query().
    response = requests.post(
        TTS_API_URL,
        headers=headers,
        json={"inputs": translated_text},
        timeout=120,
    )
    response.raise_for_status()

    # No format is forced here: the container of the returned audio is not
    # established by this code, so the decoder is left to detect it.
    audio_segment = AudioSegment.from_file(io.BytesIO(response.content))

    audio_data = np.array(audio_segment.get_array_of_samples())
    if audio_segment.channels > 1:
        # Turn the flat sample array into one column per channel.
        audio_data = audio_data.reshape((-1, audio_segment.channels))

    # The numpy audio output needs the sample rate alongside the samples.
    return audio_segment.frame_rate, audio_data

# Build the Gradio interface: an uploaded file goes in, numpy audio comes out.
upload_input = gr.inputs.File(type="file")
speech_output = gr.outputs.Audio(type="numpy")

iface = gr.Interface(
    fn=translate_speech,
    inputs=upload_input,
    outputs=speech_output,
    title="Hausa to English Translation",
    description="Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis.",
)

# Launch the interface.
iface.launch()