File size: 3,110 Bytes
5b74a4b 1ee8cb6 83e3ccb abd2b24 ea3653e 3077d90 5b74a4b 25fb027 a927d1d ea3653e c5fae6e 5add931 3077d90 88de73c 952235c 88de73c 2de6d50 88de73c 5add931 72632b9 25fb027 2de6d50 25fb027 2de6d50 25fb027 c58bd88 8c23bfa 25fb027 17cfe18 25fb027 a5ec736 88de73c b2c7d3a 5b74a4b ef4cfee 8fe6fd5 5b74a4b b2c7d3a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
import gradio as gr
import librosa
import numpy as np
import torch
from pydub import AudioSegment
from transformers import pipeline, AutoTokenizer
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Speech recognition: the CTC acoustic model and its processor are both
# loaded from the same Hub checkpoint, so the id is spelled out only once.
ASR_CHECKPOINT = "Akashpb13/Hausa_xlsr"
model = Wav2Vec2ForCTC.from_pretrained(ASR_CHECKPOINT)
processor = Wav2Vec2Processor.from_pretrained(ASR_CHECKPOINT)

# Hausa text -> English text.
translator = pipeline(
    task="text2text-generation",
    model="Baghdad99/saad-hausa-text-to-english-text",
)

# English text -> English speech.
tts = pipeline(
    task="text-to-speech",
    model="Baghdad99/english_voice_tts",
)
def translate_speech(audio_input):
    """Translate a spoken Hausa recording into synthesised English speech.

    The recording goes through three stages: CTC speech recognition with the
    module-level ``model`` / ``processor``, text translation with
    ``translator``, and speech synthesis with ``tts``.

    Args:
        audio_input: Path to the audio file to translate (the Gradio input
            component is configured with ``type="filepath"``).

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is a 1-D
        ``int16`` NumPy array, or ``None`` when no audio was supplied or one
        of the stages produced no usable output.
    """
    if not audio_input:
        # Gradio hands over None when the user submits without a recording.
        print("No audio input was provided")
        return None

    # Resample to the rate the feature extractor was configured with while
    # loading; feeding audio at any other rate degrades recognition.
    target_rate = processor.feature_extractor.sampling_rate
    audio_data, _ = librosa.load(audio_input, sr=target_rate)

    # Prepare the model input, stating the sample rate explicitly so the
    # processor can verify it instead of silently assuming one.
    input_dict = processor(
        audio_data,
        sampling_rate=target_rate,
        return_tensors="pt",
        padding=True,
    )

    # Run inference on whatever device the model currently lives on rather
    # than assuming CUDA; gradients are not needed for decoding.
    with torch.no_grad():
        logits = model(input_dict.input_values.to(model.device)).logits

    # Greedy CTC decoding: most likely token per frame, then collapse to text.
    pred_ids = torch.argmax(logits, dim=-1)[0]
    transcription = processor.decode(pred_ids).strip()
    if not transcription:
        print("The audio did not produce a transcription")
        return None
    print(f"Transcription: {transcription}")  # Print the transcription

    # Let the pipeline decode its own output so special tokens (padding,
    # end-of-sequence) never reach the speech synthesiser.
    translated_text = translator(transcription)
    print(f"Translated text: {translated_text}")  # Print the translated text
    translated_text_str = ""
    if translated_text:
        translated_text_str = translated_text[0].get("generated_text", "").strip()
    if not translated_text_str:
        print("The translation pipeline did not return any text")
        return None
    print(f"Translated text string: {translated_text_str}")

    # Use the text-to-speech pipeline to synthesise the translated text.
    synthesised_speech = tts(translated_text_str)
    if "audio" not in synthesised_speech:
        print("The synthesised speech does not contain 'audio'")
        return None

    # Flatten to mono samples and clip to [-1, 1] so the int16 conversion
    # cannot wrap around on out-of-range values.
    samples = np.asarray(synthesised_speech["audio"], dtype=np.float32).flatten()
    samples = np.clip(samples, -1.0, 1.0)

    # Prefer the rate reported by the TTS pipeline; 16000 is the fallback the
    # original code always assumed.
    output_rate = synthesised_speech.get("sampling_rate", 16000)
    return output_rate, (samples * 32767).astype(np.int16)
# Define the Gradio interface.
# Components come from the top-level ``gr.Audio`` class: the ``gr.inputs`` /
# ``gr.outputs`` namespaces are deprecated in Gradio 3 and absent in Gradio 4.
iface = gr.Interface(
    fn=translate_speech,
    inputs=gr.Audio(type="filepath"),  # handler receives a path on disk
    outputs=gr.Audio(type="numpy"),  # handler returns (sample_rate, samples)
    title="Hausa to English Translation",
    description="Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis.",
)
iface.launch()
|