File size: 1,953 Bytes
d347764
b2c7d3a
382ed84
d347764
b2c7d3a
 
 
d347764
b2c7d3a
 
c714a80
d347764
b2c7d3a
 
 
d347764
b2c7d3a
17cfe18
 
 
dd785c2
17cfe18
 
dd785c2
b2c7d3a
17cfe18
b2c7d3a
382ed84
b2c7d3a
d347764
b2c7d3a
 
 
d347764
b2c7d3a
 
 
d347764
b2c7d3a
d347764
17cfe18
b2c7d3a
382ed84
b2c7d3a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTextToWaveform
import torch

# Load your pretrained models.
# NOTE: all three loads hit the Hugging Face Hub on first run (network I/O)
# and keep the models resident in memory for the lifetime of the process.

# Hausa automatic speech recognition (CTC head on wav2vec2).
asr_model = Wav2Vec2ForCTC.from_pretrained("Baghdad99/saad-speech-recognition-hausa-audio-to-text")
# Matching processor: feature extractor + CTC tokenizer for the ASR model.
asr_processor = Wav2Vec2Processor.from_pretrained("Baghdad99/saad-speech-recognition-hausa-audio-to-text")

# Load the Hausa -> English translation model (seq2seq).
translation_tokenizer = AutoTokenizer.from_pretrained("Baghdad99/saad-hausa-text-to-english-text")
# from_tf=True: the checkpoint was published as TensorFlow weights and is
# converted to PyTorch at load time.
translation_model = AutoModelForSeq2SeqLM.from_pretrained("Baghdad99/saad-hausa-text-to-english-text", from_tf=True)

# Load the English Text-to-Speech model (text -> waveform).
tts_tokenizer = AutoTokenizer.from_pretrained("Baghdad99/english_voice_tts")
tts_model = AutoModelForTextToWaveform.from_pretrained("Baghdad99/english_voice_tts")

def translate_speech(speech):
    """Transcribe Hausa speech, translate it to English, and speak the result.

    Args:
        speech: Gradio microphone input with ``type="numpy"``, i.e. a
            ``(sample_rate, audio_signal)`` tuple where ``audio_signal`` is an
            int16 numpy array of shape ``(samples,)`` or ``(samples, channels)``.

    Returns:
        A ``(sample_rate, waveform)`` tuple so the Gradio ``"audio"`` output
        component can play the synthesized English speech.
    """
    # Gradio's numpy audio type yields (sample_rate, data) — in that order.
    # The original code unpacked these reversed, feeding the sample rate as
    # the waveform.
    sample_rate, audio_signal = speech

    # Collapse stereo to mono by averaging the *channel* axis. Gradio delivers
    # (samples, channels), so that is the last axis, not axis 0.
    if audio_signal.ndim > 1:
        audio_signal = audio_signal.mean(axis=-1)

    # Gradio delivers int16 PCM; wav2vec2 feature extraction expects floats.
    audio_signal = audio_signal.astype("float32")

    # Transcribe the Hausa speech to text. Passing sampling_rate lets the
    # processor validate/normalize against the model's expected rate.
    # NOTE(review): wav2vec2 checkpoints are typically trained at 16 kHz; if
    # the microphone rate differs, resampling should happen here — TODO confirm.
    asr_inputs = asr_processor(
        audio_signal, sampling_rate=sample_rate, return_tensors="pt", padding=True
    )
    with torch.no_grad():  # inference only — skip autograd bookkeeping
        logits = asr_model(asr_inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = asr_processor.decode(predicted_ids[0])

    # Translate the Hausa transcription to English.
    translation_inputs = translation_tokenizer(
        transcription, return_tensors="pt", padding=True
    )
    with torch.no_grad():
        translated = translation_model.generate(**translation_inputs)
    translated_text = [
        translation_tokenizer.decode(t, skip_special_tokens=True) for t in translated
    ]

    # Convert the translated text to speech.
    tts_inputs = tts_tokenizer(translated_text, return_tensors="pt")
    with torch.no_grad():
        audio = tts_model.generate(tts_inputs["input_ids"])

    # A Gradio "audio" output expects (sample_rate, ndarray), not a raw tensor.
    # NOTE(review): the TTS output rate is taken from the model config —
    # TODO confirm this checkpoint exposes `sampling_rate` there.
    tts_rate = getattr(tts_model.config, "sampling_rate", 16000)
    return tts_rate, audio[0].detach().cpu().numpy()


# Define the Gradio interface.
# `gr.inputs.Audio(source=...)` is the pre-3.x API and was removed from
# modern Gradio releases; components now live at the top level and the
# microphone is selected via the `sources` list.
iface = gr.Interface(
    fn=translate_speech,
    inputs=gr.Audio(sources=["microphone"], type="numpy"),
    outputs="audio",
)
iface.launch()