File size: 2,705 Bytes
5b74a4b
d822bd4
 
5c4fa2e
5b74a4b
 
 
 
 
 
 
 
d822bd4
 
 
 
5b74a4b
72632b9
5b74a4b
425531b
 
 
5b74a4b
a48f8e0
 
5b74a4b
72632b9
 
 
a48f8e0
72632b9
a48f8e0
5b74a4b
72632b9
a99bdb2
01153e2
5b74a4b
774e76f
 
 
 
01153e2
774e76f
01153e2
1f03166
d822bd4
 
 
 
730fef5
 
72632b9
ab7bc1a
d822bd4
17cfe18
72632b9
a5ec736
b2c7d3a
5b74a4b
 
eaff29b
5b74a4b
 
 
 
 
b2c7d3a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import gradio as gr
from transformers import pipeline, VitsModel, AutoTokenizer
import torch
import numpy as np

# Speech-recognition pipeline: the model and its tokenizer are both taken
# from the same checkpoint, so the identifier is named once.
_ASR_CHECKPOINT = "Baghdad99/saad-speech-recognition-hausa-audio-to-text"
pipe = pipeline(
    task="automatic-speech-recognition",
    model=_ASR_CHECKPOINT,
    tokenizer=_ASR_CHECKPOINT,
)

# Text-to-text pipeline used to translate the transcription.
translator = pipeline(
    task="text2text-generation",
    model="Baghdad99/saad-hausa-text-to-english-text",
)

# Text-to-speech: the VITS model and its tokenizer come from one checkpoint.
_TTS_CHECKPOINT = "Baghdad99/english_voice_tts"
tts_tokenizer = AutoTokenizer.from_pretrained(_TTS_CHECKPOINT)
tts_model = VitsModel.from_pretrained(_TTS_CHECKPOINT)

def _to_mono_float32(audio_data):
    """Return *audio_data* as a 1-D float32 array.

    Integer samples are divided by the maximum value of their dtype so they
    land in roughly [-1, 1]. Arrays with more than one dimension are averaged
    over axis 1 (assumed to be the channel axis, i.e. a (samples, channels)
    layout -- confirm against the audio component in use).
    """
    samples = np.asarray(audio_data)
    if np.issubdtype(samples.dtype, np.integer):
        samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    return samples


def translate_speech(audio):
    """Transcribe recorded speech, translate it, and synthesise the result.

    Args:
        audio: ``(sample_rate, samples)`` tuple, or ``None`` when nothing
            was recorded.

    Returns:
        ``(sampling_rate, waveform)`` where ``waveform`` is a 1-D float32
        NumPy array, or ``None`` when there is no audio, the transcription
        is empty, or the translation is empty.

    Note:
        The recording is handed to the ASR pipeline together with its sample
        rate so the pipeline can resample it to the rate the model expects.
        In ``transformers`` that resampling step relies on ``torchaudio``
        when the two rates differ.
    """
    if audio is None:
        print("No audio was provided")
        return None

    sample_rate, audio_data = audio
    samples = _to_mono_float32(audio_data)
    if samples.size == 0:
        print("The recording is empty")
        return None

    # Pass the sample rate along instead of dropping it: a bare array would
    # be interpreted as already being at the model's sampling rate.
    output = pipe({"sampling_rate": sample_rate, "raw": samples})
    print(f"Output: {output}")

    transcription = output.get("text", "").strip()
    if not transcription:
        print("The output does not contain 'text'")
        return None

    # Let the pipeline decode its own output; this skips special tokens,
    # which would otherwise be fed verbatim into the speech synthesiser.
    translated_text = translator(transcription)
    print(f"Translated text: {translated_text}")

    translated_text_str = ""
    if translated_text:
        translated_text_str = translated_text[0].get("generated_text", "").strip()
    if not translated_text_str:
        print("The translated text does not contain 'generated_text'")
        return None

    tts_inputs = tts_tokenizer(translated_text_str, return_tensors="pt")
    with torch.no_grad():
        waveform = tts_model(**tts_inputs).waveform

    # The model returns a batch of waveforms; take the single item so the
    # result is 1-D rather than (1, samples).
    synthesised_speech = waveform[0].cpu().numpy().astype(np.float32)

    # Report the rate the TTS model was configured with rather than a
    # hard-coded constant.
    return tts_model.config.sampling_rate, synthesised_speech

# Build the Gradio interface: microphone audio in, synthesised audio out.
# Components are taken from the top-level ``gr`` namespace; the
# ``gr.inputs`` / ``gr.outputs`` namespaces are deprecated in Gradio 3.x.
iface = gr.Interface(
    fn=translate_speech,
    inputs=gr.Audio(source="microphone", type="numpy"),
    outputs=gr.Audio(type="numpy"),
    title="Hausa to English Translation",
    description="Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis.",
)

iface.launch()