import gradio as gr
import numpy as np
from pydub import AudioSegment  # requires ffmpeg for MP3 export
import io
from transformers import pipeline

# Load the three pipelines: Hausa speech recognition, Hausa-to-English translation, and English text-to-speech
pipe = pipeline(
    "automatic-speech-recognition",
    model="Akashpb13/Hausa_xlsr",
    tokenizer="Akashpb13/Hausa_xlsr"
)
translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text")
tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")
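
# Optional (an assumption, not part of the original app): each pipeline call
# accepts a device argument to run on a GPU when one is available, e.g.:
#   translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text", device=0)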

def translate_speech(audio_data_tuple):
    # Extract the audio data from the tuple
    sample_rate, audio_data = audio_data_tuple

    # Convert the audio data to int16 format
    audio_data_int16 = audio_data.astype(np.int16)

    # Create an AudioSegment from the audio data
    audio_segment = AudioSegment(
        audio_data_int16.tobytes(),  # Audio data as bytes
        frame_rate=sample_rate,
        sample_width=audio_data_int16.dtype.itemsize,  # Width in bytes
        channels=1
    )
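    # Note: channels=1 assumes mono input; Gradio microphone capture is
    # typically mono, but stereo recordings would need to be downmixed first.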

    # Export the AudioSegment as MP3 into an in-memory buffer
    mp3_buffer = io.BytesIO()
    audio_segment.export(mp3_buffer, format="mp3")

    # Write the buffer to disk so the speech recognition pipeline can read it by filename
    with open("audio.mp3", "wb") as f:
        f.write(mp3_buffer.getvalue())

    # Transcribe the Hausa audio with the speech recognition pipeline
    output = pipe("audio.mp3")
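
    # Alternative (a sketch): the ASR pipeline also accepts a raw numpy array,
    # which would avoid the MP3/disk round trip entirely:
    #   audio_float = audio_data.astype(np.float32) / 32768.0
    #   output = pipe({"sampling_rate": sample_rate, "raw": audio_float})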

    print(f"Output: {output}")  # Print the output to see what it contains

    # Check if the output contains 'text'
    if 'text' in output:
        transcription = output["text"]
    else:
        print("The output does not contain 'text'")
        return

    # Print the transcription
    print(f"Transcription: {transcription}")

    # Use the translation pipeline to translate the transcription
    translated_text = translator(transcription, return_tensors="pt")
    print(f"Translated text: {translated_text}")  # Print the translated text to see what it contains

    # Check if the translated text contains 'generated_token_ids'
    if 'generated_token_ids' in translated_text[0]:
        # Decode the tokens into text, skipping special tokens such as <pad> and </s>
        translated_text_str = translator.tokenizer.decode(translated_text[0]['generated_token_ids'], skip_special_tokens=True)
    else:
        print("The translated text does not contain 'generated_token_ids'")
        return
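
    # Alternative (a sketch): calling the pipeline without return_tensors
    # returns the decoded text directly:
    #   translated_text_str = translator(transcription)[0]['generated_text']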

    # Print the translated text string
    print(f"Translated text string: {translated_text_str}")

    # Use the text-to-speech pipeline to synthesize the translated text
    synthesised_speech = tts(translated_text_str)
    print(f"Synthesised speech: {synthesised_speech}")  # Print the synthesised speech to see what it contains

    # Check if the synthesised speech contains 'audio'
    if 'audio' in synthesised_speech:
        synthesised_speech_data = synthesised_speech['audio']
        # Use the sample rate reported by the TTS pipeline, defaulting to 16 kHz
        sampling_rate = synthesised_speech.get('sampling_rate', 16000)
    else:
        print("The synthesised speech does not contain 'audio'")
        return

    # Flatten the audio data
    synthesised_speech_data = synthesised_speech_data.flatten()

    # Print the shape and type of the synthesised speech data
    print(f"Synthesised speech data type: {type(synthesised_speech_data)}, Synthesised speech data shape: {synthesised_speech_data.shape}")

    # Scale the float audio to the int16 range expected by Gradio
    synthesised_speech = (synthesised_speech_data * 32767).astype(np.int16)

    return sampling_rate, synthesised_speech


# Define the Gradio interface
iface = gr.Interface(
    fn=translate_speech,
    inputs=gr.Audio(sources=["microphone"], type="numpy"),
    outputs=gr.Audio(type="numpy"),
    title="Hausa to English Translation",
    description="Real-time demo for Hausa to English translation using speech recognition and text-to-speech synthesis."
)

iface.launch()
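
# Quick offline test (a sketch; the WAV filename and the soundfile dependency
# are assumptions, and translate_speech expects a (sample_rate, int16 array) tuple):
#   import soundfile as sf
#   data, rate = sf.read("sample.wav", dtype="int16")
#   result = translate_speech((rate, data))
#   if result is not None:
#       out_rate, out_audio = result
#       sf.write("translated.wav", out_audio, out_rate)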