File size: 2,786 Bytes
5b74a4b fcc244c 83e3ccb 22cb79a 5b74a4b fcc244c 425531b fcc244c 5b74a4b fcc244c 0622f39 83e3ccb 03b277d 0622f39 404d7f0 83e3ccb fcc244c 83e3ccb fcc244c a48f8e0 fcc244c a48f8e0 5b74a4b 72632b9 fcc244c c58bd88 fcc244c 17cfe18 22cb79a 83e3ccb a5ec736 b2c7d3a 5b74a4b 8fe6fd5 5b74a4b b2c7d3a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import io
import os
import tempfile

import gradio as gr
import numpy as np
import requests
import soundfile as sf
from pydub import AudioSegment
# Define the Hugging Face Inference API URLs and headers
ASR_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/saad-speech-recognition-hausa-audio-to-text"
TTS_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/english_voice_tts"
TRANSLATION_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/saad-hausa-text-to-english-text"
# SECURITY: the API token should come from the environment (HF_TOKEN), not
# from source control. The literal below is kept only as a fallback so
# existing deployments keep working; because it has been committed it must be
# treated as leaked -- revoke it and delete the fallback.
headers = {
    "Authorization": "Bearer "
    + os.environ.get("HF_TOKEN", "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")
}
# Define the function to query the Hugging Face Inference API
def query(api_url, payload, timeout=120):
    """POST *payload* as JSON to *api_url* and return the decoded JSON reply.

    Args:
        api_url: Full URL of the endpoint to call.
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        The response body parsed as JSON. The HTTP status is not inspected,
        so an error reply that is valid JSON is returned as-is.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.post(
        api_url, headers=headers, json=payload, timeout=timeout
    )
    return response.json()
# Upper bound, in seconds, on each direct Inference API call made below so a
# stalled request cannot block the handler indefinitely.
_API_TIMEOUT_SECONDS = 120


def _extract_translated_text(payload):
    """Return the translated string found in a translation reply, or None.

    Accepts a bare string, a dict, or a non-empty list whose first item is a
    dict, and looks for a string under "translation_text" or "generated_text".
    NOTE(review): those key names are what hosted translation / text2text
    models are expected to return -- confirm against the deployed model.
    """
    if isinstance(payload, str):
        return payload
    if isinstance(payload, list) and payload:
        payload = payload[0]
    if isinstance(payload, dict):
        for key in ("translation_text", "generated_text"):
            text = payload.get(key)
            if isinstance(text, str):
                return text
    return None


# Define the function to translate speech
def translate_speech(audio):
    """Transcribe recorded speech, translate it, and synthesize the result.

    Args:
        audio: ``(sample_rate, samples)`` tuple where ``samples`` is a numpy
            array, or ``None`` (e.g. submitted without a recording).

    Returns:
        ``(sample_rate, samples)`` for the synthesized speech -- the same
        layout this function receives -- or ``None`` when there is no input
        or any of the three API calls yields nothing usable.
    """
    print(f"Type of audio: {type(audio)}, Value of audio: {audio}")  # Debug line
    if audio is None:
        return None

    # audio is a tuple (int, np.ndarray); encode it as WAV bytes for upload
    sample_rate, audio_data = audio
    if isinstance(audio_data, np.ndarray) and audio_data.ndim == 1:
        # 1-D (mono) input is reshaped to a single-column 2-D array
        audio_data = np.reshape(audio_data, (-1, 1))
    # Encode in memory: the previous delete=False temp file was never removed
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, audio_data, sample_rate, format="WAV")

    # Use the ASR endpoint to transcribe the audio
    response = requests.post(
        ASR_API_URL,
        headers=headers,
        data=wav_buffer.getvalue(),
        timeout=_API_TIMEOUT_SECONDS,
    )
    output = response.json()
    # Check if the output contains 'text'
    if not isinstance(output, dict) or "text" not in output:
        print("The output does not contain 'text'")
        return None
    transcription = output["text"]

    # Use the translation endpoint; forward only the translated string, not
    # the raw JSON reply, to the TTS endpoint
    translated_text = _extract_translated_text(
        query(TRANSLATION_API_URL, {"inputs": transcription})
    )
    if translated_text is None:
        print("The translation response does not contain translated text")
        return None

    # Use the TTS endpoint to synthesize the translated text
    response = requests.post(
        TTS_API_URL,
        headers=headers,
        json={"inputs": translated_text},
        timeout=_API_TIMEOUT_SECONDS,
    )
    if not response.ok:
        # An error reply is not audio; decoding it would fail
        print(f"The TTS request failed with status {response.status_code}")
        return None

    # Convert the audio bytes to an audio segment
    # NOTE(review): assumes the endpoint returns WAV data -- confirm
    audio_segment = AudioSegment.from_file(
        io.BytesIO(response.content), format="wav"
    )
    # Convert the audio segment to a numpy array
    samples = np.array(audio_segment.get_array_of_samples())
    if audio_segment.channels == 2:
        # Stereo samples arrive interleaved; split into two columns
        samples = samples.reshape((-1, 2))
    return audio_segment.frame_rate, samples
# Build the Gradio interface from named pieces, then start it
_mic_input = gr.inputs.Audio(type="numpy", source="microphone")
_audio_output = gr.outputs.Audio(type="numpy")
_ui_text = {
    "title": "Hausa to English Translation",
    "description": "Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis.",
}
iface = gr.Interface(translate_speech, _mic_input, _audio_output, **_ui_text)
iface.launch()
|