File size: 3,110 Bytes
5b74a4b
1ee8cb6
83e3ccb
abd2b24
ea3653e
3077d90
 
 
 
 
 
5b74a4b
25fb027
 
a927d1d
ea3653e
c5fae6e
 
5add931
3077d90
 
 
 
 
 
 
 
 
 
 
 
88de73c
 
952235c
88de73c
 
 
2de6d50
88de73c
 
 
5add931
72632b9
25fb027
2de6d50
25fb027
 
 
 
 
2de6d50
25fb027
 
 
 
 
 
 
 
 
 
 
 
 
c58bd88
8c23bfa
 
 
25fb027
 
17cfe18
25fb027
a5ec736
88de73c
b2c7d3a
5b74a4b
 
ef4cfee
8fe6fd5
5b74a4b
 
 
 
b2c7d3a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import gradio as gr
import librosa
import numpy as np
import torch
from pydub import AudioSegment
from transformers import pipeline, AutoTokenizer
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Hugging Face Hub checkpoints used by the three stages of the demo.
ASR_CHECKPOINT = "Akashpb13/Hausa_xlsr"
TRANSLATION_CHECKPOINT = "Baghdad99/saad-hausa-text-to-english-text"
TTS_CHECKPOINT = "Baghdad99/english_voice_tts"

# Stage 1: Hausa speech recognition (Wav2Vec2 CTC model and its processor).
model = Wav2Vec2ForCTC.from_pretrained(ASR_CHECKPOINT)
processor = Wav2Vec2Processor.from_pretrained(ASR_CHECKPOINT)

# Stage 2: Hausa text -> English text.
translator = pipeline("text2text-generation", model=TRANSLATION_CHECKPOINT)

# Stage 3: English text -> speech.
tts = pipeline("text-to-speech", model=TTS_CHECKPOINT)

def translate_speech(audio_input):
    """Translate spoken Hausa audio into synthesised English speech.

    The audio is transcribed with the module-level Wav2Vec2 CTC ``model`` /
    ``processor``, the transcription is translated with ``translator``, and
    the translation is synthesised with ``tts``.

    Args:
        audio_input: Path to the audio file to translate (the Gradio input
            is configured with ``type="filepath"``). ``None`` or an empty
            path yields ``None``.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is a 1-D
        ``np.int16`` array, or ``None`` when there is no input or any stage
        produces nothing usable.
    """
    # Gradio passes None when the user submits without providing audio.
    if not audio_input:
        print("No audio input was provided")
        return None

    # Resample on load to the rate the checkpoint's feature extractor was
    # configured with, instead of keeping the file's native rate.
    target_rate = processor.feature_extractor.sampling_rate
    audio_data, _ = librosa.load(audio_input, sr=target_rate)

    # Prepare the model input
    input_dict = processor(
        audio_data,
        sampling_rate=target_rate,
        return_tensors="pt",
        padding=True,
    )

    # Run the acoustic model on whatever device it lives on; gradients are
    # not needed for inference.
    with torch.no_grad():
        logits = model(input_dict.input_values.to(model.device)).logits

    # Greedy CTC decoding: most likely token per frame, then collapse
    pred_ids = torch.argmax(logits, dim=-1)[0]
    transcription = processor.decode(pred_ids)
    print(f"Transcription: {transcription}")  # Print the transcription

    if not transcription.strip():
        print("The transcription is empty")
        return None

    # Use the translation pipeline to translate the transcription;
    # return_tensors=True makes it return token ids rather than text.
    translated_text = translator(transcription, return_tensors=True)
    print(f"Translated text: {translated_text}")  # Print the translated text

    # Check if the translated text contains 'generated_token_ids'
    if not translated_text or 'generated_token_ids' not in translated_text[0]:
        print("The translated text does not contain 'generated_token_ids'")
        return None

    # Decode the tokens into text, dropping special tokens (padding /
    # end-of-sequence markers) so they are not read out by the TTS model.
    translated_text_str = translator.tokenizer.decode(
        translated_text[0]['generated_token_ids'],
        skip_special_tokens=True,
    )
    print(f"Translated text string: {translated_text_str}")  # Print the translated text string

    # Use the text-to-speech pipeline to synthesise the translated text
    synthesised_speech = tts(translated_text_str)

    # Check if the synthesised speech contains 'audio'
    if 'audio' not in synthesised_speech:
        print("The synthesised speech does not contain 'audio'")
        return None

    # Flatten to 1-D and clip to [-1, 1] so scaling cannot wrap around int16
    waveform = np.clip(np.asarray(synthesised_speech['audio']).flatten(), -1.0, 1.0)

    # Scale the audio data to the range of int16 format
    pcm_samples = (waveform * 32767).astype(np.int16)

    # Report the rate the TTS pipeline actually produced; 16000 was the
    # previously hard-coded value and is kept as the fallback.
    output_rate = synthesised_speech.get('sampling_rate', 16000)

    return output_rate, pcm_samples


# Define the Gradio interface.
# gr.Audio is used for both ends: the gr.inputs / gr.outputs namespaces are
# deprecated in Gradio 3 and removed in Gradio 4.
iface = gr.Interface(
    fn=translate_speech,
    # The handler receives the path of the recorded/uploaded audio file.
    inputs=gr.Audio(type="filepath"),
    # The handler returns a (sample_rate, int16 samples) tuple.
    outputs=gr.Audio(type="numpy"),
    title="Hausa to English Translation",
    description="Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis.",
)

iface.launch()