File size: 1,953 Bytes
d347764 b2c7d3a 382ed84 d347764 b2c7d3a d347764 b2c7d3a c714a80 d347764 b2c7d3a d347764 b2c7d3a 17cfe18 dd785c2 17cfe18 dd785c2 b2c7d3a 17cfe18 b2c7d3a 382ed84 b2c7d3a d347764 b2c7d3a d347764 b2c7d3a d347764 b2c7d3a d347764 17cfe18 b2c7d3a 382ed84 b2c7d3a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
import gradio as gr
import numpy as np
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTextToWaveform
# Hugging Face Hub repositories backing the three stages of the pipeline:
# speech recognition -> text translation -> speech synthesis.
ASR_REPO = "Baghdad99/saad-speech-recognition-hausa-audio-to-text"
TRANSLATION_REPO = "Baghdad99/saad-hausa-text-to-english-text"
TTS_REPO = "Baghdad99/english_voice_tts"

# Stage 1: Hausa speech recognition (Wav2Vec2 CTC model plus its processor).
asr_processor = Wav2Vec2Processor.from_pretrained(ASR_REPO)
asr_model = Wav2Vec2ForCTC.from_pretrained(ASR_REPO)

# Stage 2: Hausa -> English text translation (seq2seq model).
# `from_tf=True` asks Transformers to load the weights from a TensorFlow
# checkpoint in the repository.
translation_tokenizer = AutoTokenizer.from_pretrained(TRANSLATION_REPO)
translation_model = AutoModelForSeq2SeqLM.from_pretrained(
    TRANSLATION_REPO,
    from_tf=True,
)

# Stage 3: English text-to-speech.
tts_tokenizer = AutoTokenizer.from_pretrained(TTS_REPO)
tts_model = AutoModelForTextToWaveform.from_pretrained(TTS_REPO)
def _to_mono_float32(audio):
    """Return *audio* as a 1-D float32 array, averaging channels if present."""
    audio = np.asarray(audio)
    if np.issubdtype(audio.dtype, np.integer):
        # Integer PCM: scale by the dtype's full-scale value so samples land
        # in roughly [-1, 1] like the float audio speech models are fed.
        full_scale = float(np.iinfo(audio.dtype).max)
        audio = audio.astype(np.float32) / full_scale
    else:
        audio = audio.astype(np.float32)
    if audio.ndim > 1:
        # Gradio delivers multi-channel audio as (samples, channels), so the
        # channel axis is the last one, not the first.
        audio = audio.mean(axis=1)
    return audio


def _resample_linear(signal, source_rate, target_rate):
    """Resample a non-empty 1-D *signal* to *target_rate* by interpolation."""
    if source_rate == target_rate:
        return signal
    target_length = max(1, int(round(signal.shape[0] * target_rate / source_rate)))
    source_times = np.arange(signal.shape[0]) / source_rate
    target_times = np.arange(target_length) / target_rate
    # Plain linear interpolation (no anti-aliasing filter): crude, but it
    # avoids pulling in an extra audio dependency.
    # TODO: switch to a proper resampler if one becomes available.
    return np.interp(target_times, source_times, signal).astype(np.float32)


def translate_speech(speech):
    """Turn a recorded utterance into synthesised speech of its translation.

    The recording is transcribed with the ASR model, the transcription is
    translated with the seq2seq model, and the translated text is rendered
    to audio with the text-to-speech model.

    Args:
        speech: The value Gradio passes for a ``type="numpy"`` audio input:
            a ``(sample_rate, samples)`` tuple, or ``None`` when nothing
            was recorded.

    Returns:
        A ``(sample_rate, waveform)`` tuple, the format Gradio's audio
        output accepts, where ``waveform`` is a NumPy array; or ``None``
        when there is no input audio or nothing to synthesise.
    """
    if speech is None:
        return None

    # Gradio's numpy audio is (sample_rate, data), in that order.
    sample_rate, audio_signal = speech
    audio_signal = _to_mono_float32(audio_signal)
    if audio_signal.size == 0:
        return None

    # The microphone rate is browser-dependent; the ASR model must be fed
    # the rate its feature extractor was configured for.
    target_rate = asr_processor.feature_extractor.sampling_rate
    audio_signal = _resample_linear(audio_signal, sample_rate, target_rate)

    # Transcribe the speech to text.
    asr_inputs = asr_processor(
        audio_signal,
        sampling_rate=target_rate,
        return_tensors="pt",
        padding=True,
    )
    with torch.no_grad():  # inference only: skip autograd bookkeeping
        logits = asr_model(asr_inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = asr_processor.decode(predicted_ids[0])

    # Translate the text.
    translation_inputs = translation_tokenizer(
        transcription, return_tensors="pt", padding=True
    )
    translated = translation_model.generate(**translation_inputs)
    translated_text = translation_tokenizer.decode(
        translated[0], skip_special_tokens=True
    )
    if not translated_text.strip():
        return None

    # Convert the translated text to speech.
    tts_inputs = tts_tokenizer(translated_text, return_tensors="pt")
    with torch.no_grad():
        if tts_model.can_generate():
            waveform = tts_model.generate(tts_inputs["input_ids"])
        else:
            # Non-autoregressive text-to-waveform models such as VITS return
            # the audio from a plain forward pass as `.waveform`.
            waveform = tts_model(**tts_inputs).waveform

    # NOTE(review): VITS-style configs expose `sampling_rate`; confirm this
    # holds for the checkpoint actually deployed.
    output_rate = tts_model.config.sampling_rate

    # Gradio expects (sample_rate, numpy array), not a batched torch tensor.
    return output_rate, waveform.squeeze().cpu().numpy()
# Define the Gradio interface: microphone audio in, synthesised audio out.
# The legacy `gr.inputs` namespace is used when the installed Gradio still
# provides it (unchanged behaviour there). Gradio 4 removed that namespace
# and renamed `source` to the list-valued `sources`, so fall back to the
# top-level component in that case.
if hasattr(gr, "inputs"):
    microphone_input = gr.inputs.Audio(source="microphone", type="numpy")
else:
    microphone_input = gr.Audio(sources=["microphone"], type="numpy")

iface = gr.Interface(
    fn=translate_speech,
    inputs=microphone_input,
    outputs="audio",
)
iface.launch()
|