Spaces:

Baghdad99
/

ha-en

Sleeping

File size: 2,229 Bytes

5b74a4b
 
5c4fa2e
5b74a4b
 
 
 
 
 
 
 
 
 
 
425531b
 
 
5b74a4b
a48f8e0
 
5b74a4b
a48f8e0
 
 
 
 
 
5b74a4b
a48f8e0
5b74a4b
a48f8e0
 
 
 
5b74a4b
a48f8e0
 
 
 
 
 
 
 
 
 
 
 
 
 
17cfe18
a5ec736
b2c7d3a
5b74a4b
 
eaff29b
5b74a4b
 
 
 
 
b2c7d3a

import gradio as gr
from transformers import pipeline
import numpy as np

# Hugging Face Hub repositories backing each stage of the demo.
_ASR_MODEL_ID = "Baghdad99/saad-speech-recognition-hausa-audio-to-text"
_TRANSLATION_MODEL_ID = "Baghdad99/saad-hausa-text-to-english-text"
_TTS_MODEL_ID = "Baghdad99/english_voice_tts"

# Stage 1: speech recognition. The tokenizer is loaded from the same
# repository as the model.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=_ASR_MODEL_ID,
    tokenizer=_ASR_MODEL_ID,
)

# Stage 2: text-to-text generation used to translate the transcription.
translator = pipeline(task="text2text-generation", model=_TRANSLATION_MODEL_ID)

# Stage 3: text-to-speech synthesis of the translated text.
tts = pipeline(task="text-to-speech", model=_TTS_MODEL_ID)

def translate_speech(audio):
    """Transcribe recorded speech, translate it, and synthesise the result.

    Runs the three module-level pipelines in sequence: ``pipe`` (speech
    recognition), ``translator`` (text-to-text generation) and ``tts``
    (text-to-speech).

    Args:
        audio: ``(sample_rate, samples)`` tuple as delivered by a Gradio
            audio input with ``type="numpy"``, or ``None`` when nothing was
            recorded. ``samples`` may be mono ``(n,)`` or multi-channel
            ``(n, channels)``.

    Returns:
        A ``(sampling_rate, samples)`` tuple with 16-bit PCM samples for a
        Gradio numpy audio output, or ``None`` when there is nothing to
        translate (no recording, empty recording, or empty transcription /
        translation).
    """
    # Gradio passes None when the user submits without recording anything.
    if audio is None:
        return None

    sample_rate, audio_data = audio
    audio_data = np.asarray(audio_data)
    if audio_data.size == 0:
        return None

    # The ASR pipeline expects floating-point samples; integer PCM (what the
    # microphone input delivers) is rescaled into roughly [-1.0, 1.0].
    if np.issubdtype(audio_data.dtype, np.signedinteger):
        scale = float(np.iinfo(audio_data.dtype).max)
        audio_data = audio_data.astype(np.float32) / scale
    else:
        audio_data = audio_data.astype(np.float32)

    # The ASR pipeline only accepts single-channel audio: down-mix
    # (samples, channels) recordings by averaging the channels.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    # Passing the sampling rate alongside the samples lets the pipeline
    # resample to the rate the model expects.
    # NOTE(review): resampling inside the pipeline needs torchaudio when the
    # recording rate differs from the model's rate - confirm it is installed.
    output = pipe({"sampling_rate": sample_rate, "raw": audio_data})

    # The speech-recognition pipeline reports its result under "text".
    transcription = (output.get("text") or "").strip()
    if not transcription:
        print("The speech recognition output does not contain any text")
        return None

    # text2text-generation returns a list with one dict per input string.
    translated_text = translator(transcription)[0]["generated_text"].strip()
    if not translated_text:
        print("The translation pipeline returned no text")
        return None

    speech = tts(translated_text)

    # Drop any leading batch/channel axis of length one, then convert the
    # float waveform to 16-bit PCM; clipping prevents integer wrap-around.
    waveform = np.squeeze(np.asarray(speech["audio"], dtype=np.float32))
    pcm = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)

    return speech["sampling_rate"], pcm


# Build the Gradio UI: a microphone recording goes in, translated speech
# comes out. Both components exchange audio as numpy data.
microphone_input = gr.inputs.Audio(source="microphone", type="numpy")
speech_output = gr.outputs.Audio(type="numpy")

iface = gr.Interface(
    fn=translate_speech,
    inputs=microphone_input,
    outputs=speech_output,
    title="Hausa to English Translation",
    description="Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis.",
)

# Start serving the demo.
iface.launch()