# ha-en/app.py
import gradio as gr
import numpy as np
import torch
import librosa
from transformers import pipeline, Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the model and processor
model = Wav2Vec2ForCTC.from_pretrained("Akashpb13/Hausa_xlsr")
processor = Wav2Vec2Processor.from_pretrained("Akashpb13/Hausa_xlsr")
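
# Run on a GPU when one is available, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
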
translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text")
tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")
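
# Note on the pipeline outputs used below: with return_tensors set, the
# text2text-generation pipeline returns a list like
# [{"generated_token_ids": <tensor>}], and the text-to-speech pipeline
# returns a dict like {"audio": <np.ndarray>, "sampling_rate": <int>}.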

def translate_speech(audio_input):
    # Load the audio file as a floating point time series, resampled to the
    # 16 kHz rate the Wav2Vec2 model expects
    audio_data, sample_rate = librosa.load(audio_input, sr=16000)

    # Prepare the input dictionary
    input_dict = processor(audio_data, sampling_rate=16000, return_tensors="pt", padding=True)

    # Use the model to get the logits (no gradients needed for inference)
    with torch.no_grad():
        logits = model(input_dict.input_values.to(device)).logits

    # Get the predicted IDs and decode them to get the transcription
    pred_ids = torch.argmax(logits, dim=-1)[0]
    transcription = processor.decode(pred_ids)
    print(f"Transcription: {transcription}")  # Print the transcription

    # Use the translation pipeline to translate the transcription
    translated_text = translator(transcription, return_tensors="pt")
    print(f"Translated text: {translated_text}")  # Print the raw pipeline output

    # Check if the translated text contains 'generated_token_ids'
    if 'generated_token_ids' in translated_text[0]:
        # Decode the tokens into text, dropping special tokens such as padding
        translated_text_str = translator.tokenizer.decode(
            translated_text[0]['generated_token_ids'], skip_special_tokens=True
        )
        print(f"Translated text string: {translated_text_str}")  # Print the translated text string
    else:
        print("The translated text does not contain 'generated_token_ids'")
        return

    # Use the text-to-speech pipeline to synthesise the translated text
    synthesised_speech = tts(translated_text_str)

    # Check if the synthesised speech contains 'audio'
    if 'audio' in synthesised_speech:
        synthesised_speech_data = synthesised_speech['audio']
    else:
        print("The synthesised speech does not contain 'audio'")
        return

    # Flatten the audio data and scale it to the int16 range Gradio expects
    synthesised_speech_data = synthesised_speech_data.flatten()
    synthesised_speech_int16 = (synthesised_speech_data * 32767).astype(np.int16)

    # Return (sample_rate, audio) in the format a numpy-typed Audio output expects
    return synthesised_speech['sampling_rate'], synthesised_speech_int16

# Define the Gradio interface
iface = gr.Interface(
    fn=translate_speech,
    inputs=gr.Audio(type="filepath"),   # record or upload audio; passed in as a file path
    outputs=gr.Audio(type="numpy"),     # expects a (sample_rate, np.int16 array) tuple
    title="Hausa to English Translation",
    description="Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis.",
)

iface.launch()
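
# A minimal local smoke test, assuming a Hausa recording saved as
# "sample_ha.wav" next to this script (the filename is only an illustration):
#
#   sr, audio = translate_speech("sample_ha.wav")
#   print(sr, audio.shape, audio.dtype)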