File size: 2,705 Bytes
5b74a4b d822bd4 5c4fa2e 5b74a4b d822bd4 5b74a4b 72632b9 5b74a4b 425531b 5b74a4b a48f8e0 5b74a4b 72632b9 a48f8e0 72632b9 a48f8e0 5b74a4b 72632b9 a99bdb2 01153e2 5b74a4b 774e76f 01153e2 774e76f 01153e2 1f03166 d822bd4 730fef5 72632b9 ab7bc1a d822bd4 17cfe18 72632b9 a5ec736 b2c7d3a 5b74a4b eaff29b 5b74a4b b2c7d3a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
import gradio as gr
from transformers import pipeline, VitsModel, AutoTokenizer
import torch
import numpy as np
# Speech recognition stage: one checkpoint provides both the acoustic model
# and the tokenizer used to turn its predictions into text.
_ASR_CHECKPOINT = "Baghdad99/saad-speech-recognition-hausa-audio-to-text"
pipe = pipeline(
    task="automatic-speech-recognition",
    model=_ASR_CHECKPOINT,
    tokenizer=_ASR_CHECKPOINT,
)

# Translation stage: a text2text-generation pipeline fed with the transcription.
_TRANSLATION_CHECKPOINT = "Baghdad99/saad-hausa-text-to-english-text"
translator = pipeline(task="text2text-generation", model=_TRANSLATION_CHECKPOINT)

# Speech synthesis stage: VITS model and its matching tokenizer.
_TTS_CHECKPOINT = "Baghdad99/english_voice_tts"
tts_model = VitsModel.from_pretrained(_TTS_CHECKPOINT)
tts_tokenizer = AutoTokenizer.from_pretrained(_TTS_CHECKPOINT)
# Speech-to-speech translation: recognise, translate, then synthesise.
def _prepare_asr_input(sample_rate, audio_data):
    """Convert a Gradio numpy recording into the dict the ASR pipeline accepts.

    Returns ``None`` for an empty recording. Otherwise returns
    ``{"sampling_rate": ..., "raw": ...}`` holding mono float32 samples, so
    the pipeline knows the true rate instead of assuming the model's own.
    """
    samples = np.asarray(audio_data)
    if samples.size == 0:
        return None
    # Gradio delivers stereo as (samples, channels); the pipeline only
    # accepts single-channel audio.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    # Gradio typically hands over int16 PCM; speech models expect floats.
    samples = samples.astype(np.float32)
    peak = np.max(np.abs(samples))
    if peak > 0:  # avoid dividing by zero on a completely silent recording
        samples /= peak
    return {"sampling_rate": sample_rate, "raw": samples}


def translate_speech(audio):
    """Transcribe recorded speech, translate the text and speak the result.

    Args:
        audio: ``(sample_rate, samples)`` tuple as produced by a Gradio audio
            input with ``type="numpy"``, or ``None`` when nothing was
            recorded.

    Returns:
        ``(sample_rate, waveform)`` where ``waveform`` is a 1-D float32 numpy
        array, or ``None`` when there is no input or a stage produces
        nothing usable.
    """
    if audio is None:
        print("No audio was provided")
        return None

    sample_rate, audio_data = audio
    asr_input = _prepare_asr_input(sample_rate, audio_data)
    if asr_input is None:
        print("The recording is empty")
        return None

    # Use the speech recognition pipeline to transcribe the audio
    output = pipe(asr_input)
    print(f"Output: {output}")
    transcription = output.get("text") if isinstance(output, dict) else None
    if not transcription:
        print("The output does not contain 'text'")
        return None

    # Use the translation pipeline to translate the transcription
    translated_text = translator(transcription, return_tensors="pt")
    print(f"Translated text: {translated_text}")
    if not translated_text or 'generated_token_ids' not in translated_text[0]:
        print("The translated text does not contain 'generated_token_ids'")
        return None
    # Drop pad/eos markers so they are not passed on to the TTS model as text.
    translated_text_str = translator.tokenizer.decode(
        translated_text[0]['generated_token_ids'],
        skip_special_tokens=True,
    ).strip()
    if not translated_text_str:
        print("The translation is empty")
        return None

    # Use the VITS model to synthesise the translated text
    tts_inputs = tts_tokenizer(translated_text_str, return_tensors="pt")
    with torch.no_grad():
        waveform = tts_model(**tts_inputs).waveform

    # The model returns (batch, samples); Gradio expects a 1-D mono array,
    # so take the single batch item.
    synthesised_speech = waveform[0].cpu().numpy().astype(np.float32)
    # Report the rate the model actually generates at rather than a constant.
    return tts_model.config.sampling_rate, synthesised_speech
# Build the web UI: microphone recording in, synthesised audio out, with
# both sides exchanging numpy data with translate_speech.
microphone_input = gr.inputs.Audio(source="microphone", type="numpy")
speech_output = gr.outputs.Audio(type="numpy")

iface = gr.Interface(
    title="Hausa to English Translation",
    description="Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis.",
    fn=translate_speech,
    inputs=microphone_input,
    outputs=speech_output,
)

# Start serving the interface.
iface.launch()
|