import gradio as gr
import numpy as np
from pydub import AudioSegment
import io
from transformers import pipeline, AutoTokenizer
# Model pipeline setup (all three are downloaded from the Hugging Face Hub
# on first run and held in memory for the lifetime of the app).
#
# pipe       — Hausa automatic speech recognition (wav2vec2-XLSR fine-tune).
# translator — Hausa→English text-to-text translation.
# tts        — English text-to-speech synthesis.
pipe = pipeline(
"automatic-speech-recognition",
model="Akashpb13/Hausa_xlsr",
tokenizer="Akashpb13/Hausa_xlsr"
)
translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text")
tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")
def translate_speech(audio_data_tuple):
    """Translate spoken Hausa into synthesized English speech.

    Pipeline: microphone audio -> MP3 (in memory) -> Hausa ASR ->
    Hausa->English translation -> English TTS.

    Parameters
    ----------
    audio_data_tuple : tuple[int, np.ndarray] | None
        ``(sample_rate, samples)`` as delivered by the Gradio microphone
        component, or ``None`` when the user submitted without recording.

    Returns
    -------
    tuple[int, np.ndarray] | None
        ``(sampling_rate, int16 samples)`` of the synthesized English
        speech, or ``None`` when any stage fails (a message is printed).
    """
    # Guard: Gradio passes None when no audio was recorded; the original
    # unpacking would raise TypeError here.
    if audio_data_tuple is None:
        print("No audio was recorded")
        return None

    sample_rate, audio_data = audio_data_tuple

    audio_data_int16 = np.asarray(audio_data).astype(np.int16)
    # The microphone may deliver stereo as (samples, channels); pydub is
    # told channels=1 below, so mix down to mono first — otherwise the
    # interleaved samples play back garbled at double speed.
    if audio_data_int16.ndim == 2:
        audio_data_int16 = audio_data_int16.mean(axis=1).astype(np.int16)

    audio_segment = AudioSegment(
        audio_data_int16.tobytes(),          # raw PCM bytes
        frame_rate=sample_rate,
        sample_width=audio_data_int16.dtype.itemsize,  # 2 bytes for int16
        channels=1,
    )

    # Encode to MP3 entirely in memory; the ASR pipeline accepts the raw
    # encoded bytes, so no temporary file on disk is needed.
    mp3_buffer = io.BytesIO()
    audio_segment.export(mp3_buffer, format="mp3")

    # --- Speech recognition (Hausa) ---
    output = pipe(mp3_buffer.getvalue())
    print(f"Output: {output}")
    if not isinstance(output, dict) or "text" not in output:
        print("The output does not contain 'text'")
        return None
    transcription = output["text"]
    print(f"Transcription: {transcription}")

    # --- Translation (Hausa -> English) ---
    translated_text = translator(transcription, return_tensors="pt")
    print(f"Translated text: {translated_text}")
    if not translated_text or "generated_token_ids" not in translated_text[0]:
        print("The translated text does not contain 'generated_token_ids'")
        return None
    # skip_special_tokens keeps markers like </s> out of the TTS input.
    translated_text_str = translator.tokenizer.decode(
        translated_text[0]["generated_token_ids"], skip_special_tokens=True
    )
    print(f"Translated text string: {translated_text_str}")

    # --- Text-to-speech (English) ---
    synthesised_speech = tts(translated_text_str)
    print(f"Synthesised speech: {synthesised_speech}")
    if not isinstance(synthesised_speech, dict) or "audio" not in synthesised_speech:
        print("The synthesised speech does not contain 'audio'")
        return None

    speech_data = np.asarray(synthesised_speech["audio"]).flatten()
    print(
        f"Synthesised speech data type: {type(speech_data)}, "
        f"Synthesised speech data shape: {speech_data.shape}"
    )
    # Clip before scaling: samples at/beyond ±1.0 would otherwise wrap
    # around when cast to int16.
    speech_int16 = (np.clip(speech_data, -1.0, 1.0) * 32767).astype(np.int16)

    # Prefer the rate reported by the TTS pipeline; fall back to 16 kHz
    # (the rate the original code hard-coded).
    out_rate = synthesised_speech.get("sampling_rate", 16000)
    return out_rate, speech_int16
# Define the Gradio interface.
# NOTE: the original used gr.inputs.Audio(source=...) / gr.outputs.Audio(...),
# the legacy namespaced components that were deprecated in Gradio 2.x and
# removed in 3.x/4.x. The modern gr.Audio component is used instead, with
# the same behavior: record from the microphone, pass (rate, ndarray) in,
# and play back the (rate, ndarray) the function returns.
iface = gr.Interface(
    fn=translate_speech,
    inputs=gr.Audio(sources=["microphone"], type="numpy"),
    outputs=gr.Audio(type="numpy"),
    title="Hausa to English Translation",
    description="Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis."
)
iface.launch()